diff --git a/.bazelrc b/.bazelrc index cf15d0976b1..f2aa3ac447b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,6 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +# config to build OneDNN backend with a user specified threadpool. +build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true +build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_threadpool --define=build_with_mkldnn_threadpool=true +build:mkl_threadpool -c opt # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. build:using_cuda --define=using_cuda=true @@ -163,6 +168,8 @@ build:cuda_clang --action_env TF_CUDA_CLANG=1 build:dbg --config=opt -c dbg # for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360 build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON +# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498 +build:dbg --copt -DDEBUG_BUILD build:tensorrt --action_env TF_NEED_TENSORRT=1 @@ -233,10 +240,15 @@ build:c++17 --cxxopt=-std=c++1z build:c++17 --cxxopt=-stdlib=libc++ build:c++1z --config=c++17 -# Enable using platform specific build settings +# Enable using platform specific build settings, except when cross-compiling for +# mobile platforms. build --enable_platform_specific_config +build:android --noenable_platform_specific_config +build:ios --noenable_platform_specific_config # Suppress C++ compiler warnings, otherwise build logs become 10s of MBs. +build:android --copt=-w +build:ios --copt=-w build:linux --copt=-w build:macos --copt=-w build:windows --copt=/w @@ -256,6 +268,10 @@ build:macos --define=INCLUDEDIR=$(PREFIX)/include # TF_SYSTEM_LIBS do not work on windows. # By default, build TF in C++ 14 mode. +build:android --cxxopt=-std=c++14 +build:android --host_cxxopt=-std=c++14 +build:ios --cxxopt=-std=c++14 +build:ios --host_cxxopt=-std=c++14 build:linux --cxxopt=-std=c++14 build:linux --host_cxxopt=-std=c++14 build:macos --cxxopt=-std=c++14 diff --git a/.github/bot_config.yml b/.github/bot_config.yml new file mode 100644 index 00000000000..88c737f41e2 --- /dev/null +++ b/.github/bot_config.yml @@ -0,0 +1,87 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. 
+ +# A list of assignees +assignees: + - amahendrakar + - ravikyram + - Saduf2019 +# A list of assignees for compiler folder +compiler_assignees: + - joker-eph +# Cuda Comment +cuda_comment: > + From the template it looks like you are installing **TensorFlow** (TF) prebuilt binaries: + * For TF-GPU - See point 1 + * For TF-CPU - See point 2 + ----------------------------------------------------------------------------------------------- + + **1. Installing **TensorFlow-GPU** (TF) prebuilt binaries** + + + Make sure you are using compatible TF and CUDA versions. + Please refer to the following TF and CUDA version compatibility table. + + | TF | CUDA | + + | :-------------: | :-------------: | + + | 2.1.0 - 2.2.0 | 10.1 | + + | 1.13.1 - 2.0 | 10.0 | + + | 1.5.0 - 1.12.0 | 9.0 | + + * If you have the above configuration and are using the _**Windows**_ platform - + * Try adding the CUDA, CUPTI, and cuDNN installation directories to the %PATH% environment variable. + * Refer to the [windows setup guide](https://www.tensorflow.org/install/gpu#windows_setup). + * If you have the above configuration and are using the _**Ubuntu/Linux**_ platform - + * Try adding the CUDA, CUPTI, and cuDNN installation directories to the $LD_LIBRARY_PATH environment variable. + * Refer to the [linux setup guide](https://www.tensorflow.org/install/gpu#linux_setup). + * If the error still persists, your CPU model apparently does not support AVX instruction sets. + * Refer to the [hardware requirements](https://www.tensorflow.org/install/pip#hardware-requirements). + + ----------------------------------------------------------------------------------------------- + + **2. Installing **TensorFlow** (TF) CPU prebuilt binaries** + + + *TensorFlow release binaries version 1.6 and higher are prebuilt with AVX instruction sets.* + + + Therefore, on any CPU that does not have these instruction sets, either the CPU or the GPU version of TF will fail to load. + + Apparently, your CPU model does not support AVX instruction sets. You can still use TensorFlow with the alternatives given below: + + * Try Google Colab to use TensorFlow. + * The easiest way to use TF is to switch to [google colab](https://colab.sandbox.google.com/notebooks/welcome.ipynb#recent=true). It comes with the latest stable TF version pre-installed, and you can use ```pip install``` to install any other preferred TF version. + * It has the added advantage that you can easily switch between different hardware accelerators (CPU, GPU, TPU) as the task requires. + * All you need is a good internet connection and you are all set. + * Try building TF from source with different CPU optimization flags. + + *Please let us know if this helps.* + +windows_comment: > + From the stack trace it looks like you are hitting the Windows path length limit. + * Try disabling the path length limit on Windows 10. + * Refer to the [disable path length limit instructions guide](https://mspoweruser.com/ntfs-260-character-windows-10/). + + Please let us know if this helps. 
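The `cuda_comment` template above walks users through checking that their TensorFlow build, CUDA libraries, and CPU instruction sets are compatible. As a minimal sketch (not part of this patch), the following Python snippet uses only public TensorFlow APIs to surface the same information locally; it assumes a standard TensorFlow 2.x installation.

```python
# Minimal diagnostic sketch (not part of this patch): prints the details the
# cuda_comment template asks users to verify.
import tensorflow as tf

# TF version: compare against the TF/CUDA compatibility table above.
print("TensorFlow version:", tf.__version__)

# Whether this binary was built with CUDA support (TF-GPU vs. TF-CPU).
print("Built with CUDA:", tf.test.is_built_with_cuda())

# An empty list here usually means the CUDA, CUPTI, and cuDNN directories are
# not on %PATH% (Windows) or $LD_LIBRARY_PATH (Linux), as described above.
print("Visible GPUs:", tf.config.list_physical_devices("GPU"))
```

If the process aborts with an "illegal instruction" error before anything is printed, the AVX guidance in point 2 of the template is usually the relevant one rather than the CUDA setup.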
diff --git a/README.md b/README.md index 27032043e07..a76b1bfd0b7 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,17 @@ open-source software development: ### Official Builds -Build Type | Status | Artifacts ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- -**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA -**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) -**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv6l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) -**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv7l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +Build Type | Status | Artifacts +------------------------ | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- +**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA +**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) +**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) +**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) ### Community Supported Builds @@ -142,6 +142,7 @@ Build Type | Status * [Getting Started with TensorFlow 2 from Coursera](https://www.coursera.org/learn/getting-started-with-tensor-flow2) * [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187) * [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190) +* [Machine Learning with TensorFlow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp) * [TensorFlow Blog](https://blog.tensorflow.org) * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml) * [TensorFlow Twitter](https://twitter.com/tensorflow) diff --git a/RELEASE.md b/RELEASE.md index b5d088821e4..6f3aa94c203 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,185 @@ +# Release 2.3.0 + +## Breaking Changes + +* `tf.image.extract_glimpse` has been updated to correctly process the case + where `centered=False` and `normalized=False`. 
This is a breaking change as + the output is different from (incorrect) previous versions. Note this + breaking change only impacts `tf.image.extract_glimpse` and + `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of + `tf.compat.v1.image.extract_glimpse` does not change. The behavior of + the existing C++ kernel `ExtractGlimpse` does not change either, so saved + models will not be impacted. + +# Release 2.1.1 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) +* Fixes a versioning bug which causes Keras layers from TF 1.x to be used instead of those from TF 2.x. + +# Release 2.0.2 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 1.15.3 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), 
[CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 2.2.0 + +TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). + +Coinciding with this change, new releases of [TensorFlow's Docker images](https://hub.docker.com/r/tensorflow/tensorflow/) provide Python 3 exclusively. Because all images now use Python 3, Docker tags containing `-py3` will no longer be provided and existing `-py3` tags like `latest-py3` will not be updated. + +## Major Features and Improvements + +* Replaced the scalar type for string tensors from `std::string` to `tensorflow::tstring` which is now ABI stable. +* A new Profiler for TF 2 for CPU/GPU/TPU. It offers both device and host performance analysis, including input pipeline and TF Ops. Optimization advisory is provided whenever possible. Please see [this tutorial](https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras) and [guide](https://www.tensorflow.org/guide/profiler) for usage guidelines. +* Export C++ functions to Python using `pybind11` as opposed to `SWIG` as a part of our [deprecation of swig efforts](https://github.com/tensorflow/community/blob/master/rfcs/20190208-pybind11.md). +* `tf.distribute`: + * Support added for global sync `BatchNormalization` by using the newly added `tf.keras.layers.experimental.SyncBatchNormalization` layer. This layer will sync `BatchNormalization` statistics every step across all replicas taking part in sync training. + * Performance improvements for GPU multi-worker distributed training using `tf.distribute.experimental.MultiWorkerMirroredStrategy` + * Update NVIDIA `NCCL` to `2.5.7-1` for better performance and performance tuning. Please see [nccl developer guide](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html) for more information on this. + * Support gradient `allreduce` in `float16`. See this [example](https://github.com/tensorflow/models/blob/master/official/staging/training/grad_utils.py) usage. + * Experimental support of [all reduce gradient packing](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/CollectiveHints) to allow overlapping gradient aggregation with backward path computation. + * Deprecated `experimental_run_v2` method for distribution strategies and renamed the method `run` as it is no longer experimental. + * Add CompositeTensor support for DistributedIterators. This should help prevent unnecessary function retracing and memory leaks. +* `tf.keras`: + * `Model.fit` major improvements: + * You can now use custom training logic with `Model.fit` by overriding `Model.train_step`. + * Easily write state-of-the-art training loops without worrying about all of the features `Model.fit` handles for you (distribution strategies, callbacks, data formats, looping logic, etc) + * See the default [`Model.train_step`](https://github.com/tensorflow/tensorflow/blob/1381fc8e15e22402417b98e3881dfd409998daea/tensorflow/python/keras/engine/training.py#L540) for an example of what this function should look like. Same applies for validation and inference via `Model.test_step` and `Model.predict_step`. 
+ * SavedModel uses its own `Model._saved_model_inputs_spec` attr now instead of + relying on `Model.inputs` and `Model.input_names`, which are no longer set for subclass Models. + This attr is set in eager, `tf.function`, and graph modes. This gets rid of the need for users to + manually call `Model._set_inputs` when using Custom Training Loops (CTLs). + * Dynamic shapes are supported for generators by calling the Model on the first batch we "peek" from the generator. + This used to happen implicitly in `Model._standardize_user_data`. Long-term, a solution where the + `DataAdapter` doesn't need to call the Model is probably preferable. + * The SavedModel format now supports all Keras built-in layers (including metrics, preprocessing layers, and stateful RNN layers). + * Update the Keras batch normalization layer to use the running mean and average computation in `fused_batch_norm`. You should see significant performance improvements when using `fused_batch_norm` in Eager mode. + +* `tf.lite`: + * Enable the TFLite experimental new converter by default. +* XLA + * XLA now builds and works on Windows. All prebuilt packages come with XLA available. + * XLA can be [enabled for a `tf.function`](https://www.tensorflow.org/xla#explicit_compilation_with_tffunction +) with “compile or throw exception” semantics on CPU and GPU. + +## Breaking Changes +* `tf.keras`: + * In `tf.keras.applications` the name of the "top" layer has been standardized to "predictions". This is only a problem if your code relies on the exact name of the layer. + * The Huber loss function has been updated to be consistent with other Keras losses. It now computes the mean over the last axis of per-sample losses before applying the reduction function. +* AutoGraph no longer converts functions passed to `tf.py_function`, `tf.py_func` and `tf.numpy_function`. +* Deprecating `XLA_CPU` and `XLA_GPU` devices with this release. +* Increasing the minimum Bazel version required to build TF to 2.0.0, in order to use Bazel's `cc_experimental_shared_library`. +* Keras compile/fit behavior for functional and subclassed models has been unified. Model properties such as `metrics` and `metrics_names` will now be available only after **training/evaluating the model on actual data** for functional models. `metrics` will **now include** model `loss` and output losses. The `loss_functions` property has been removed from the model. This was an undocumented property that was accidentally public and has now been removed. + +## Known Caveats +* The current TensorFlow release now **requires** [gast](https://pypi.org/project/gast/) version 0.3.3. + +## Bug Fixes and Other Changes +* `tf.data`: + * Removed `autotune_algorithm` from experimental optimization options. +* TF Core: + * `tf.constant` always creates CPU tensors irrespective of the current device context. + * Eager `TensorHandles` maintain a list of mirrors for any copies to local or remote devices. This avoids any redundant copies due to op execution. + * For `tf.Tensor` & `tf.Variable`, `.experimental_ref()` is no longer experimental and is available as simply `.ref()`. + * `pfor/vectorized_map`: Added support for vectorizing 56 more ops. Vectorizing `tf.cond` is also supported now. + * Set as much partial shape as we can infer statically within the gradient impl of the gather op. + * Gradient of `tf.while_loop` emits a `StatelessWhile` op if `cond` and body functions are stateless. This allows multiple gradient while ops to run in parallel under distribution strategy. 
+ * Speed up `GradientTape` in eager mode by auto-generating a list of op inputs/outputs which are unused and hence not cached for gradient functions. + * Support `back_prop=False` in `while_v2` but mark it as deprecated. + * Improve error message when attempting to use `None` in data-dependent control flow. + * Add `RaggedTensor.numpy()`. + * Update `RaggedTensor.__getitem__` to preserve uniform dimensions & allow indexing into uniform dimensions. + * Update `tf.expand_dims` to always insert the new dimension as a non-ragged dimension. + * Update `tf.embedding_lookup` to use `partition_strategy` and `max_norm` when `ids` is ragged. + * Allow `batch_dims==rank(indices)` in `tf.gather`. + * Add support for bfloat16 in `tf.print`. +* `tf.distribute`: + * Support `embedding_column` with variable-length input features for `MultiWorkerMirroredStrategy`. +* `tf.keras`: + * Added `experimental_aggregate_gradients` argument to `tf.keras.optimizer.Optimizer.apply_gradients`. This allows custom gradient aggregation and processing of aggregated gradients in a custom training loop. + * Allow `pathlib.Path` paths for loading models via the Keras API. +* `tf.function`/AutoGraph: + * AutoGraph is now available in `ReplicaContext.merge_call`, `Strategy.extended.update` and `Strategy.extended.update_non_slot`. + * Experimental support for shape invariants has been enabled in `tf.function`. See the API docs for `tf.autograph.experimental.set_loop_options` for additional info. + * AutoGraph error messages now exclude frames corresponding to APIs internal to AutoGraph. + * Improve shape inference for `tf.function` input arguments to unlock more Grappler optimizations in TensorFlow 2.x. + * Improve automatic control dependency management of resources by allowing resource reads to occur in parallel and synchronizing only on writes. + * Fix execution order of multiple stateful calls to `experimental_run_v2` in `tf.function`. + * You can now iterate over `RaggedTensors` using a for loop inside `tf.function`. +* `tf.lite`: + * Migrated the `tf.lite` C inference API out of experimental into lite/c. + * Add an option to disallow `NNAPI` CPU / partial acceleration on Android 10. + * TFLite Android AARs now include the C headers and APIs that are required to use TFLite from native code. + * Refactors the delegate and delegate kernel sources to allow usage in the linter. + * Limit delegated ops to actually supported ones if a device name is specified or `NNAPI` CPU Fallback is disabled. + * TFLite now supports the `tf.math.reciprocal1` op by lowering it to the `tf.div` op. + * TFLite's unpack op now supports boolean tensor inputs. + * Microcontroller and embedded code moved from experimental to the main TensorFlow Lite folder. + * Check for large TFLite tensors. + * Fix GPU delegate crash with C++17. + * Add 5D support to TFLite `strided_slice`. + * Fix error in delegation of `DEPTH_TO_SPACE` to `NNAPI` causing the op not to be accelerated. + * Fix segmentation fault when running a model with LSTM nodes using the `NNAPI` delegate. + * Fix `NNAPI` delegate failure when an operand for the Maximum/Minimum operation is a scalar. + * Fix `NNAPI` delegate failure when the Axis input for a reduce operation is a scalar. + * Expose an option to limit the number of partitions that will be delegated to `NNAPI`. + * If a target accelerator is specified, use its feature level to determine operations to delegate instead of the SDK version. 
+* `tf.random`: + * Various random number generation improvements: + * Add a fast path for default `random_uniform`. + * `random_seed` documentation improvement. + * `RandomBinomial` broadcasts and appends the sample shape to the left rather than the right. + * Added `tf.random.stateless_binomial`, `tf.random.stateless_gamma`, `tf.random.stateless_poisson`. + * `tf.random.stateless_uniform` now supports unbounded sampling of `int` types. +* Math and Linear Algebra: + * Add `tf.linalg.LinearOperatorTridiag`. + * Add `LinearOperatorBlockLowerTriangular`. + * Add broadcasting support to `tf.linalg.triangular_solve` [#26204](https://github.com/tensorflow/tensorflow/issues/26204) and `tf.math.invert_permutation`. + * Add `tf.math.sobol_sample` op. + * Add `tf.math.xlog1py`. + * Add `tf.math.special.{dawsn,expi,fresnel_cos,fresnel_sin,spence}`. + * Add a Modified Discrete Cosine Transform (MDCT) and its inverse to `tf.signal`. +* TPU Enhancements: + * Refactor `TpuClusterResolver` to move shared logic to a separate pip package. + * Support configuring the TPU software version from the cloud TPU client. + * Allowed the TPU embedding weight decay factor to be multiplied by the learning rate. +* XLA Support: + * Add standalone XLA AOT runtime target + relevant .cc sources to the pip package. + * Add a check for memory alignment to MemoryAllocation::MemoryAllocation() on 32-bit ARM. This ensures a deterministic early exit instead of a hard-to-debug bus error later. + * `saved_model_cli aot_compile_cpu` allows you to compile saved models to XLA header+object files and include them in your C++ programs. + * Enable `Igamma`, `Igammac` for XLA. +* Deterministic Op Functionality: + * The XLA reduction emitter is deterministic when the environment variable `TF_DETERMINISTIC_OPS` is set to "true" or "1". This extends deterministic `tf.nn.bias_add` back-prop functionality (and therefore also deterministic back-prop of bias-addition in Keras layers) to include when XLA JIT compilation is enabled. + * Fix a problem, when running on a CUDA GPU and when either the environment variable `TF_DETERMINISTIC_OPS` or the environment variable `TF_CUDNN_DETERMINISTIC` is set to "true" or "1", in which some layer configurations led to an exception with the message "No algorithm worked!" +* Tracing and Debugging: + * Add source and destination names to the `_send` traceme to allow easier debugging. + * Add a traceme event to `fastpathexecute`. +* Other: + * Fix an issue with `AUC.reset_states` for multi-label AUC [#35852](https://github.com/tensorflow/tensorflow/issues/35852). + * Fix the TF upgrade script to not delete files when there is a parsing error and the output mode is `in-place`. + * Move `tensorflow/core:framework/*_pyclif` rules to `tensorflow/core/framework:*_pyclif`. + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +372046933, 8bitmp3, aaronhma, Abin Shahab, Aditya Patwardhan, Agoniii, Ahti Kitsik, Alan Yee, Albin Joy, Alex Hoffman, Alexander Grund, Alexandre E. 
Eichenberger, Amit Kumar Jaiswal, amoitra, Andrew Anderson, Angus-Luo, Anthony Barbier, Anton Kachatkou, Anuj Rawat, archis, Arpan-Dhatt, Arvind Sundararajan, Ashutosh Hathidara, autoih, Bairen Yi, Balint Cristian, Bas Aarts, BashirSbaiti, Basit Ayantunde, Ben Barsdell, Benjamin Gaillard, boron, Brett Koonce, Bryan Cutler, Christian Goll, Christian Sachs, Clayne Robison, comet, Daniel Falbel, Daria Zhuravleva, darsh8200, David Truby, Dayananda-V, deepakm, Denis Khalikov, Devansh Singh, Dheeraj R Reddy, Diederik Van Liere, Diego Caballero, Dominic Jack, dothinking, Douman, Drake Gens, Duncan Riach, Ehsan Toosi, ekuznetsov139, Elena Zhelezina, elzino, Ending2015a, Eric Schweitz, Erik Zettel, Ethan Saadia, Eugene Kuznetsov, Evgeniy Zheltonozhskiy, Ewout Ter Hoeven, exfalso, FAIJUL, Fangjun Kuang, Fei Hu, Frank Laub, Frederic Bastien, Fredrik Knutsson, frreiss, Frédéric Rechtenstein, fsx950223, Gaurav Singh, gbaned, George Grzegorz Pawelczak, George Sterpu, Gian Marco Iodice, Giorgio Arena, Hans Gaiser, Hans Pabst, Haoyu Wu, Harry Slatyer, hsahovic, Hugo, Hugo Sjöberg, IrinaM21, jacco, Jake Tae, Jean-Denis Lesage, Jean-Michel Gorius, Jeff Daily, Jens Elofsson, Jerry Shih, jerryyin, Jin Mingjian, Jinjing Zhou, JKIsaacLee, jojimonv, Jonathan Dekhtiar, Jose Ignacio Gomez, Joseph-Rance, Judd, Julian Gross, Kaixi Hou, Kaustubh Maske Patil, Keunwoo Choi, Kevin Hanselman, Khor Chean Wei, Kilaru Yasaswi Sri Chandra Gandhi, Koan-Sin Tan, Koki Ibukuro, Kristian Holsheimer, kurileo, Lakshay Tokas, Lee Netherton, leike666666, Leslie-Fang-Intel, Li, Guizi, LIUJIAN435, Lukas Geiger, Lyo Nguyen, madisetti, Maher Jendoubi, Mahmoud Abuzaina, Manuel Freiberger, Marcel Koester, Marco Jacopo Ferrarotti, Markus Franke, marload, Mbah-Javis, mbhuiyan, Meng Zhang, Michael Liao, MichaelKonobeev, Michal Tarnowski, Milan Straka, minoring, Mohamed Nour Abouelseoud, MoussaMM, Mrinal Jain, mrTsjolder, Måns Nilsson, Namrata Bhave, Nicholas Gao, Niels Ole Salscheider, nikochiko, Niranjan Hasabnis, Nishidha Panpaliya, nmostafa, Noah Trenaman, nuka137, Officium, Owen L - Sfe, Pallavi G, Paul Andrey, Peng Sun, Peng Wu, Phil Pearl, PhilipMay, pingsutw, Pooya Davoodi, PragmaTwice, pshiko, Qwerty71, R Gomathi, Rahul Huilgol, Richard Xiao, Rick Wierenga, Roberto Rosmaninho, ruchit2801, Rushabh Vasani, Sami, Sana Damani, Sarvesh Dubey, Sasan Jafarnejad, Sergii Khomenko, Shane Smiskol, Shaochen Shi, sharkdtu, Shawn Presser, ShengYang1, Shreyash Patodia, Shyam Sundar Dhanabalan, Siju Samuel, Somyajit Chakraborty Sam, Srihari Humbarwadi, srinivasan.narayanamoorthy, Srishti Yadav, Steph-En-M, Stephan Uphoff, Stephen Mugisha, SumanSudhir, Taehun Kim, Tamas Bela Feher, TengLu, Tetragramm, Thierry Herrmann, Tian Jin, tigertang, Tom Carchrae, Tom Forbes, Trent Lo, Victor Peng, vijayphoenix, Vincent Abriou, Vishal Bhola, Vishnuvardhan Janapati, vladbataev, VoVAllen, Wallyss Lima, Wen-Heng (Jack) Chung, wenxizhu, William D. Irons, William Zhang, Xiaoming (Jason) Cui, Xiaoquan Kong, Xinan Jiang, Yasir Modak, Yasuhiro Matsumoto, Yaxun (Sam) Liu, Yong Tang, Ytyt-Yt, yuan, Yuan Mingshuai, Yuan Tang, Yuki Ueda, Yusup, zhangshijin, zhuwenxi + # Release 2.0.1 ## Bug Fixes and Other Changes diff --git a/SECURITY.md b/SECURITY.md index 6fc2c3aa9cc..f3a6c148b2e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ your model, and we recommend you run the TensorFlow process in a sandbox. It is possible to write models that are secure in a sense that they can safely process untrusted inputs assuming there are no bugs. 
There are two main reasons -to not rely on this: first, it is easy to write models which must not be exposed +to not rely on this: First, it is easy to write models which must not be exposed to untrusted inputs, and second, there are bugs in any software system of sufficient complexity. Letting users control inputs could allow them to trigger bugs either in TensorFlow or in dependent libraries. @@ -149,7 +149,7 @@ attack (or worse). Because TensorFlow behaves correctly, this is not a vulnerability in TensorFlow (although it would be a vulnerability of this hypothetical system). -As a general rule, it is incorrect behavior for Tensorflow to access memory it +As a general rule, it is incorrect behavior for TensorFlow to access memory it does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to such behaviors constitute a vulnerability. diff --git a/WORKSPACE b/WORKSPACE index 021ed6d2542..ea741c31c7f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -114,6 +114,14 @@ http_archive( ], ) +http_archive( + name = "person_detect_data", + sha256 = "170542270da256994ce24d1e357f6e84a54fdaf7d28ff2b74725a40b70b082cf", + urls = [ + "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2020_05_24.zip", + ], +) + # Required for dependency @com_github_grpc_grpc load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") diff --git a/configure.py b/configure.py index ac9ed0c4d88..c2850beede6 100644 --- a/configure.py +++ b/configure.py @@ -144,7 +144,7 @@ def write_to_bazelrc(line): def write_action_env_to_bazelrc(var_name, var): - write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) def run_shell(cmd, allow_non_zero=False, stderr=None): @@ -205,7 +205,7 @@ def setup_python(environ_cp): # Get PYTHON_BIN_PATH, default is the current running python. default_python_bin_path = sys.executable ask_python_bin_path = ('Please specify the location of python. [Default is ' - '%s]: ') % default_python_bin_path + '{}]: ').format(default_python_bin_path) while True: python_bin_path = get_from_env_or_user_or_default(environ_cp, 'PYTHON_BIN_PATH', @@ -215,9 +215,10 @@ def setup_python(environ_cp): if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): break elif not os.path.exists(python_bin_path): - print('Invalid python path: %s cannot be found.' % python_bin_path) + print('Invalid python path: {} cannot be found.'.format(python_bin_path)) else: - print('%s is not executable. Is it the python binary?' % python_bin_path) + print('{} is not executable. Is it the python binary?'.format( + python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = '' # Convert python path to Windows style before checking lib and version @@ -236,7 +237,7 @@ def setup_python(environ_cp): default_python_lib_path = python_lib_paths[0] python_lib_path = get_input( 'Please input the desired Python library path to use. 
' - 'Default is [%s]\n' % python_lib_paths[0]) + 'Default is [{}]\n'.format(python_lib_paths[0])) if not python_lib_path: python_lib_path = default_python_lib_path environ_cp['PYTHON_LIB_PATH'] = python_lib_path @@ -252,7 +253,7 @@ def setup_python(environ_cp): # Set-up env variables used by python_configure.bzl write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) - write_to_bazelrc('build --python_path=\"%s"' % python_bin_path) + write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = python_bin_path # If choosen python_lib_path is from a path specified in the PYTHONPATH @@ -266,7 +267,7 @@ def setup_python(environ_cp): with open( os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f: - f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + f.write('export PYTHON_BIN_PATH="{}"'.format(python_bin_path)) def reset_tf_configure_bazelrc(): @@ -320,11 +321,12 @@ def get_var(environ_cp, Raise the error to avoid infinitely looping. """ if not question: - question = 'Do you wish to build TensorFlow with %s support?' % query_item + question = 'Do you wish to build TensorFlow with {} support?'.format( + query_item) if not yes_reply: - yes_reply = '%s support will be enabled for TensorFlow.' % query_item + yes_reply = '{} support will be enabled for TensorFlow.'.format(query_item) if not no_reply: - no_reply = 'No %s' % yes_reply + no_reply = 'No {}'.format(yes_reply) yes_reply += '\n' no_reply += '\n' @@ -368,7 +370,7 @@ def get_var(environ_cp, print(no_reply) var = False else: - print('Invalid selection: %s' % user_input_origin) + print('Invalid selection: {}'.format(user_input_origin)) return var @@ -479,13 +481,13 @@ def check_bazel_version(min_version, max_version): if which('bazel') is None: print('Cannot find bazel. Please install bazel.') sys.exit(0) - curr_version = run_shell( - ['bazel', '--batch', '--bazelrc=/dev/null', 'version']) - for line in curr_version.split('\n'): - if 'Build label: ' in line: - curr_version = line.split('Build label: ')[1] - break + stderr = open(os.devnull, 'wb') + curr_version = run_shell(['bazel', '--version'], + allow_non_zero=True, + stderr=stderr) + if curr_version.startswith('bazel '): + curr_version = curr_version.split('bazel ')[1] min_version_int = convert_version_to_int(min_version) curr_version_int = convert_version_to_int(curr_version) @@ -1009,17 +1011,15 @@ def set_tf_cuda_compute_capabilities(environ_cp): default_cuda_compute_capabilities = native_cuda_compute_capabilities ask_cuda_compute_capabilities = ( - 'Please specify a list of comma-separated ' - 'CUDA compute capabilities you want to ' - 'build with.\nYou can find the compute ' - 'capability of your device at: ' - 'https://developer.nvidia.com/cuda-gpus.\nPlease' - ' note that each additional compute ' - 'capability significantly increases your ' - 'build time and binary size, and that ' - 'TensorFlow only supports compute ' - 'capabilities >= 3.5 [Default is: %s]: ' % - default_cuda_compute_capabilities) + 'Please specify a list of comma-separated CUDA compute capabilities ' + 'you want to build with.\nYou can find the compute capability of your ' + 'device at: https://developer.nvidia.com/cuda-gpus. 
Each capability ' + 'can be specified as "x.y" or "compute_xy" to include both virtual and' + ' binary GPU code, or as "sm_xy" to only include the binary ' + 'code.\nPlease note that each additional compute capability ' + 'significantly increases your build time and binary size, and that ' + 'TensorFlow only supports compute capabilities >= 3.5 [Default is: ' + '%s]: ' % default_cuda_compute_capabilities) tf_cuda_compute_capabilities = get_from_env_or_user_or_default( environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', ask_cuda_compute_capabilities, default_cuda_compute_capabilities) @@ -1031,8 +1031,23 @@ def set_tf_cuda_compute_capabilities(environ_cp): for compute_capability in tf_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) if not m: - print('Invalid compute capability: %s' % compute_capability) - all_valid = False + # We now support sm_35,sm_50,sm_60,compute_70. + sm_compute_match = re.match('(sm|compute)_?([0-9]+[0-9]+)', + compute_capability) + if not sm_compute_match: + print('Invalid compute capability: %s' % compute_capability) + all_valid = False + else: + ver = int(sm_compute_match.group(2)) + if ver < 30: + print( + 'ERROR: TensorFlow only supports small CUDA compute' + ' capabilities of sm_30 and higher. Please re-specify the list' + ' of compute capabilities excluding version %s.' % ver) + all_valid = False + if ver < 35: + print('WARNING: XLA does not support CUDA compute capabilities ' + 'lower than sm_35. Disable XLA when running on older GPUs.') else: ver = float(m.group(0)) if ver < 3.0: @@ -1223,7 +1238,8 @@ def is_reduced_optimize_huge_functions_available(environ_cp): only, as of 2019-11-19). TensorFlow needs this flag to massively reduce compile times, but until 16.4 is officially released, we can't depend on it. - See also https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion + See also + https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion Because it's very annoying to check this manually (to check the MSVC installed versions, you need to use the registry, and it's not clear if Bazel will be @@ -1366,8 +1382,13 @@ def main(): # environment variables. environ_cp = dict(os.environ) - current_bazel_version = check_bazel_version(_TF_MIN_BAZEL_VERSION, - _TF_MAX_BAZEL_VERSION) + try: + current_bazel_version = check_bazel_version(_TF_MIN_BAZEL_VERSION, + _TF_MAX_BAZEL_VERSION) + except subprocess.CalledProcessError as e: + print('Error checking bazel version: ', e.output.decode('UTF-8').strip()) + raise e + _TF_CURRENT_BAZEL_VERSION = convert_version_to_int(current_bazel_version) reset_tf_configure_bazelrc() @@ -1385,7 +1406,6 @@ def main(): # Windows. environ_cp['TF_DOWNLOAD_CLANG'] = '0' environ_cp['TF_NEED_MPI'] = '0' - environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0' if is_macos(): environ_cp['TF_NEED_TENSORRT'] = '0' diff --git a/tensorflow/BUILD b/tensorflow/BUILD index f2018220a56..efbdf89ecea 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -517,18 +517,29 @@ package_group( "//perftools/accelerators/xprof/api/...", "//third_party/py/autograph/...", "//third_party/swift/tensorflow/x10/...", + "//third_party/swift/tensorflow_apis/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", ], ) -package_group(name = "ndarray_tensor_allow_list") +package_group( + name = "ndarray_tensor_allow_list", + packages = ["//learning/pathways/..."], +) # Packages that use composite tensors or dispatch. 
# TODO(b/154762408) Remove this package group once it's no longer needed. package_group(name = "composite_tensor_whitelist") +# Packages that use private types symbols, until they are exported. +# TODO(b/154650521) Remove. +package_group( + name = "types_whitelist", + packages = ["//learning/deepmind/tensorflow/replicator/..."], +) + filegroup( name = "intel_binary_blob", data = if_mkl_ml( diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 66ade5c7bd4..12021a294e8 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -85,7 +85,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:chromiumos": [ ":tf_attrtype", @@ -182,7 +182,7 @@ tf_cuda_library( ":tf_status_internal", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status", @@ -216,10 +216,11 @@ tf_cuda_library( ], visibility = [ "//tensorflow/c:__subpackages__", + "//tensorflow/compiler/mlir/tensorflow/c:__subpackages__", ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:lib", @@ -232,12 +233,13 @@ cc_library( srcs = ["tf_status.cc"], hdrs = ["tf_status.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tf_status_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tf_status_internal", "//tensorflow/core:lib", ], }), @@ -259,10 +261,15 @@ cc_library( name = "tensor_interface", hdrs = ["tensor_interface.h"], visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + }), ) cc_library( @@ -272,7 +279,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -286,16 +293,17 @@ cc_library( srcs = ["tf_tensor.cc"], hdrs = ["tf_tensor.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ":tf_status_helper", + ":tf_tensor_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", - ":tf_status_helper", - ":tf_tensor_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -311,14 +319,15 @@ tf_cuda_library( "tf_tensor_internal.h", ], visibility = ["//tensorflow:internal"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ] + select({ 
"//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:casts", @@ -386,8 +395,14 @@ tf_cuda_library( deps = [ ":tf_status", ":tf_status_internal", - "//tensorflow/core:lib", - ], + ] + select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + ], + }), ) tf_cc_test( @@ -426,7 +441,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -457,7 +472,7 @@ tf_cuda_library( ] + select({ "//tensorflow:android": [ ":c_api_internal", - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api_internal", @@ -484,7 +499,7 @@ tf_cuda_library( ":tf_status_helper", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index e623f30b98c..e9e6d470c68 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -325,205 +325,6 @@ TF_Buffer* TFE_GetServerDef(const char* text_proto, TF_Status* status) { return ret; } -TFE_Context* TFE_CreateContextFromSession(TF_Session* session, - TF_Status* status) { - auto* opts = TFE_NewContextOptions(); - - // Reduce GPU memory allocation, and set appropriate config options for TFE - // context. - auto* config = TF_CreateConfig( - /*xla*/ false, /* gpu_memory_allow_growth */ true, /* num_cpu_devices */ - 10); - TFE_ContextOptionsSetConfig(opts, config->data, config->length, status); - if (!status->status.ok()) { - CHECK(!config); - TFE_DeleteContextOptions(opts); - return nullptr; - } - - auto* ctx = TFE_NewContextFromSession(opts, session, status); - TF_DeleteBuffer(config); - TFE_DeleteContextOptions(opts); - return ctx; -} - -// TODO: retrieve the device string via TFE_ContextListDevices() -static const char DEFAULT_CPU_DEVICE[] = - "/job:localhost/replica:0/task:0/device:CPU:0"; - -static TFE_TensorHandle* createTFEQueue(TFE_Context* ctx, TF_DataType inputType, - int tensor_id, TF_Status* status) { - std::unique_ptr queueOp( - TFE_NewOp(ctx, "FIFOQueueV2", status), TFE_DeleteOp); - TFE_OpSetDevice(queueOp.get(), DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return nullptr; - // TODO: use NAMED_TENSOR_QUEUE_CAPACITY in S4TF compiler. - TFE_OpSetAttrInt(queueOp.get(), "capacity", 1); - TFE_OpSetAttrTypeList(queueOp.get(), "component_types", &inputType, 1); - auto shared_name = tensorflow::strings::StrCat("fifo_queue_", tensor_id); - TFE_OpSetAttrString(queueOp.get(), "shared_name", shared_name.data(), - shared_name.size()); - TFE_OpSetAttrString(queueOp.get(), "container", "", 0); - - // TODO: consider making this an unknown shape. 
- const int64_t* dims_ptr = nullptr; - int num_dims = 0; - TFE_OpSetAttrShapeList(queueOp.get(), "shapes", &dims_ptr, &num_dims, - /*num_values*/ 0, status); - if (!status->status.ok()) return nullptr; - - int num_retvals = 1; - TFE_TensorHandle* queue = nullptr; - TFE_Execute(queueOp.get(), &queue, &num_retvals, status); - if (!status->status.ok()) return nullptr; - CHECK_EQ(num_retvals, 1); - - return queue; -} - -static void createTFEEnqueue(TFE_Context* ctx, TF_DataType inputType, - TFE_TensorHandle* queue, TFE_TensorHandle* tensor, - TF_Status* status) { - TFE_Op* op = TFE_NewOp(ctx, "QueueEnqueueV2", status); - if (!status->status.ok()) return; - std::unique_ptr op_deleter(op, TFE_DeleteOp); - TFE_OpSetDevice(op, DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return; - TFE_OpAddInput(op, queue, status); - if (!status->status.ok()) return; - TFE_OpAddInput(op, tensor, status); - if (!status->status.ok()) return; - TFE_OpSetAttrTypeList(op, "Tcomponents", &inputType, 1); - TFE_OpSetAttrInt(op, "timeout_ms", -1); - - int num_retvals = 0; - TFE_Execute(op, nullptr /*retvals*/, &num_retvals, status); - if (!status->status.ok()) return; - CHECK_EQ(num_retvals, 0); -} - -static TFE_TensorHandle* createTFEDequeue(TFE_Context* ctx, - TF_DataType inputType, - TFE_TensorHandle* queue, - TF_Status* status) { - TFE_Op* op = TFE_NewOp(ctx, "QueueDequeueV2", status); - if (!status->status.ok()) return nullptr; - std::unique_ptr op_deleter(op, TFE_DeleteOp); - TFE_OpSetDevice(op, DEFAULT_CPU_DEVICE, status); - if (!status->status.ok()) return nullptr; - - TFE_OpAddInput(op, queue, status); - if (!status->status.ok()) return nullptr; - TFE_OpSetAttrTypeList(op, "component_types", &inputType, 1); - TFE_OpSetAttrInt(op, "timeout_ms", -1); - TFE_TensorHandle* ret; - int num_retvals = 1; - TFE_Execute(op, &ret, &num_retvals, status); - if (!status->status.ok()) return nullptr; - CHECK_EQ(num_retvals, 1); - return ret; -} - -TFE_TensorHandle* TFE_DequeueNamedTensor(TF_Session* session, int tensor_id, - TF_DataType inputType, - TF_Status* status) { - assert(session); - VLOG(1) << "Dequeuing data tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - auto* ret = createTFEDequeue(ctx, inputType, queue, status); - return ret; -} - -TFE_TensorHandle* TFE_DequeueNamedTensorFromCtx(TFE_Context* ctx, int tensor_id, - TF_DataType inputType, - TF_Status* status) { - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - auto* ret = createTFEDequeue(ctx, inputType, queue, status); - - return ret; -} - -void TFE_EnqueueNamedTensor(TF_Session* session, int tensor_id, - TFE_TensorHandle* tensor, TF_Status* status) { - assert(session); - VLOG(1) << "Enqueuing data tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TF_DataType inputType = TFE_TensorHandleDataType(tensor); - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - 
queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, inputType, queue, tensor, status); -} - -void TFE_EnqueueNamedTensorFromCtx(TFE_Context* ctx, int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status) { - VLOG(1) << "Enqueuing data tensor with id " << tensor_id; - - TF_DataType inputType = TFE_TensorHandleDataType(tensor); - TFE_TensorHandle* queue = createTFEQueue(ctx, inputType, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, inputType, queue, tensor, status); -} - -void TFE_EnqueueVariantTensor(TF_Session* session, int tensor_id, - TFE_TensorHandle* tensor, TF_Status* status) { - VLOG(1) << "Enqueuing variant tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, TF_VARIANT, tensor_id, status); - if (!status->status.ok()) return; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - createTFEEnqueue(ctx, TF_VARIANT, queue, tensor, status); -} - -TFE_TensorHandle* TFE_DequeueVariantTensor(TF_Session* session, int tensor_id, - TF_Status* status) { - VLOG(1) << "Dequeuing variant tensor with id " << tensor_id; - - auto ctx = TFE_CreateContextFromSession(session, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr ctx_deleter( - ctx, TFE_DeleteContext); - - TFE_TensorHandle* queue = createTFEQueue(ctx, TF_VARIANT, tensor_id, status); - if (!status->status.ok()) return nullptr; - std::unique_ptr - queue_deleter(queue, TFE_DeleteTensorHandle); - - return createTFEDequeue(ctx, TF_VARIANT, queue, status); -} - void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) { status->status = tensorflow::errors::Internal(errMsg); } @@ -622,10 +423,9 @@ void TF_AttrBuilderSetType(TF_AttrBuilder* builder, const char* attr_name, void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, const char* attr_name, const TF_DataType* values, int num_values) { auto iter = builder->attr_names.insert(attr_name).first; - builder->Set( - (*iter).c_str(), - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + builder->Set(*iter, tensorflow::gtl::ArraySlice( + reinterpret_cast(values), + num_values)); } void TF_AttrBuilderCheckCanRunOnDevice(TF_AttrBuilder* builder, diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 551a45d92c4..d0ffbf125fb 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -146,48 +146,6 @@ TF_CAPI_EXPORT extern void TF_EnqueueNamedTensor(TF_Session* session, // Create a serialized tensorflow.ServerDef proto. TF_Buffer* TFE_GetServerDef(const char* text_proto, TF_Status* status); -// TODO: remove this API in favor of the next one. -TF_CAPI_EXPORT extern TFE_Context* TFE_NewContextFromSession( - const TFE_ContextOptions* opts, TF_Session* sess, TF_Status* status); - -// Creates from `session` a new eager context to run a graph function or -// sends/recvs, so that these concurrent TFE executions can share (via -// `session` and its associated device mgr) the same set of fifo queue resource -// ops, used for host<->TF tensor transfers. This way the sends/recvs calls and -// graph function execution can access the same fifo queue resource handles -// (associated with devices managed by the device manager, which can be obtained -// from `session`). 
-// -// TODO: Remove this function once we migrate away from using session. -TF_CAPI_EXPORT extern TFE_Context* TFE_CreateContextFromSession( - TF_Session* session, TF_Status* status); - -// TODO: Retire this API in favor of the next one. -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueNamedTensor( - TF_Session* session, int tensor_id, TF_DataType inputType, - TF_Status* status); - -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueNamedTensorFromCtx( - TFE_Context* ctx, int tensor_id, TF_DataType inputType, TF_Status* status); - -TF_CAPI_EXPORT extern void TFE_EnqueueNamedTensor(TF_Session* session, - int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status); - -TF_CAPI_EXPORT extern void TFE_EnqueueNamedTensorFromCtx( - TFE_Context* ctx, int tensor_id, TFE_TensorHandle* tensor, - TF_Status* status); - -// TODO: consider folding the 2 APIs below into the ones above. -TF_CAPI_EXPORT extern void TFE_EnqueueVariantTensor(TF_Session* session, - int tensor_id, - TFE_TensorHandle* tensor, - TF_Status* status); - -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor( - TF_Session* session, int tensor_id, TF_Status* status); - TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg); diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3d3fc7065a4..407acfe1ca9 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -16,7 +16,6 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") package( licenses = ["notice"], # Apache 2.0 @@ -36,7 +35,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":context_interface", @@ -145,6 +144,24 @@ cc_library( ], ) +cc_library( + name = "c_api_unified_internal", + hdrs = [ + "c_api_unified_experimental_internal.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":c_api", + ":c_api_experimental", + "//tensorflow/c:c_api_internal", + "//tensorflow/c:tf_status", + "//tensorflow/core/platform:casts", + "//tensorflow/core/platform:types", + ], +) + cc_library( name = "tensor_handle_interface", hdrs = ["tensor_handle_interface.h"], @@ -320,6 +337,7 @@ tf_cuda_cc_test( tags = [ "noguitar", # TODO(b/155445984): flaky #"guitar", + "notap", # TODO(b/156981931): flaky "multi_gpu", ], deps = [ @@ -350,6 +368,38 @@ tf_cuda_cc_test( # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], extra_copts = tfe_xla_copts(), + tags = [ + "noasan", # leaks gRPC server instances + ], + deps = [ + ":c_api", + ":c_api_experimental", + ":c_api_internal", + ":c_api_test_util", + ":tfe_tensorhandle_internal", + "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", + "//tensorflow/core/common_runtime/eager:eager_operation", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "@com_google_absl//absl/strings", + ], +) + +tf_cuda_cc_test( + name = "c_api_distributed_test", + size = "small", + srcs = [ + "c_api_distributed_test.cc", + ], + # TODO(b/136478427): Figure out how to correctly shut the server down + args = 
["--heap_check=local"], + extra_copts = tfe_xla_copts(), tags = ["noasan"], # leaks gRPC server instances deps = [ ":c_api", @@ -358,10 +408,13 @@ tf_cuda_cc_test( ":c_api_test_util", ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "@com_google_absl//absl/strings", @@ -413,7 +466,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api", @@ -449,6 +502,8 @@ tf_cuda_library( "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_status_helper", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", @@ -506,6 +561,7 @@ tf_cuda_cc_test( "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", @@ -609,7 +665,6 @@ filegroup( ], exclude = [ "c_api_experimental.cc", - "*c_api_tfrt*", "*test*", "*dlpack*", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 9651a47d6ac..5a39c17e1d9 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -38,7 +38,7 @@ limitations under the License. #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_tensor_internal.h" #ifdef PLATFORM_GOOGLE -#include "tensorflow/c/eager/c_api_tfrt.h" +#include "tensorflow/core/tfrt/eager/c_api_tfrt.h" #endif #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -102,6 +102,15 @@ string DeviceName(const tensorflow::Device* d) { } #if !defined(IS_MOBILE_PLATFORM) +bool AreLocalDevicesCompatible(const tensorflow::EagerContext* context, + const tensorflow::ServerDef& server_def) { + if (server_def.job_name() != context->HostCPU()->parsed_name().job) { + return false; + } + return server_def.default_session_config().SerializeAsString() == + context->session_options().config.SerializeAsString(); +} + tensorflow::Status AddRemoteDevicesToMgr( const std::vector& added_remote_workers, tensorflow::WorkerCacheInterface* worker_cache, @@ -469,10 +478,15 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); tensorflow::GrpcServer* grpc_server; if (reset_context) { - LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &new_server)); + const tensorflow::DeviceMgr* device_mgr = + AreLocalDevicesCompatible(context, server_def) + ? 
context->local_device_mgr() + : nullptr; + LOG_AND_RETURN_IF_ERROR(tensorflow::NewServerWithOptions( + server_def, {device_mgr}, &new_server)); grpc_server = dynamic_cast(new_server.get()); LOG_AND_RETURN_IF_ERROR( - ListRemoteWorkers(grpc_server, worker_name, &remote_workers)); + ListRemoteWorkers(new_server.get(), worker_name, &remote_workers)); } else { LOG_AND_RETURN_IF_ERROR(ListRemoteWorkers(context->GetServer(), worker_name, &curr_remote_workers)); @@ -727,24 +741,6 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { tensorflow::GetDefaultCustomKernelCreator())); } -TFE_Context* TFE_NewContextFromSession(const TFE_ContextOptions* opts, - TF_Session* sess, TF_Status* status) { - const tensorflow::DeviceMgr* device_mgr = nullptr; - status->status = sess->session->LocalDeviceManager(&device_mgr); - if (!status->status.ok()) return nullptr; - tensorflow::Rendezvous* r = - new tensorflow::IntraProcessRendezvous(device_mgr); - - return tensorflow::wrap(new tensorflow::EagerContext( - opts->session_options.options, - static_cast( - opts->device_placement_policy), - static_cast(opts->mirroring_policy), - opts->async, opts->lazy_remote_inputs_copy, device_mgr, - /*device_mgr_owned*/ false, r, - tensorflow::GetDefaultCustomKernelCreator())); -} - void TFE_DeleteContext(TFE_Context* ctx) { if (ctx == nullptr) { return; @@ -899,9 +895,7 @@ TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, #if defined(IS_MOBILE_PLATFORM) status->status = tensorflow::Status::OK(); #else // !defined(IS_MOBILE_PLATFORM) - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - status->status = context->SyncExecutors(); + status->status = tensorflow::unwrap(ctx)->AsyncWait(); #endif // !IS_MOBILE_PLATFORM } @@ -924,7 +918,7 @@ extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( context->GetDevicePlacementPolicy()); } -TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { +TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status) { tensorflow::Tensor tensor; status->status = tensorflow::TF_TensorToTensor(t, &tensor); if (!status->status.ok()) return nullptr; diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 070b3a9bb60..5afe3047dd7 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -137,7 +137,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, // placed in memory of different devices or remote address spaces. typedef struct TFE_TensorHandle TFE_TensorHandle; -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status); // Indicates that the caller will not be using `h` any more. 
TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); diff --git a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc index 8f585d6f02c..7a604950a63 100644 --- a/tensorflow/c/eager/c_api_cluster_test.cc +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -30,24 +30,11 @@ namespace { using ::tensorflow::string; -tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { - tensorflow::ServerDef server_def; - server_def.set_protocol("grpc"); - server_def.set_job_name(job_name); - server_def.set_task_index(0); - tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); - tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name(job_name); - for (int i = 0; i < num_tasks; i++) { - int port = tensorflow::testing::PickUnusedPortOrDie(); - job_def->mutable_tasks()->insert( - {i, tensorflow::strings::StrCat("localhost:", port)}); - } - return server_def; -} - -tensorflow::ServerDef GetServerDef(int num_tasks) { - return GetServerDef("localhost", num_tasks); +void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index) { + tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0); + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->at(task_index) = + tensorflow::strings::StrCat("localhost:", port); } void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, @@ -101,6 +88,22 @@ void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, TF_DeleteStatus(status); } +// Read the value of variable `var` and save it into `out_value`. +void ReadVariable(TFE_Context* ctx, TFE_TensorHandle* var, + TFE_TensorHandle** out_value) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 1; + TFE_Execute(op, out_value, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + TF_DeleteStatus(status); +} + void TestRemoteExecuteChangeServerDef(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -243,6 +246,102 @@ TEST(CAPI, RemoteExecuteUpdateServerDefAsync) { TestRemoteExecuteUpdateServerDef(true); } +void TestRemoteExecuteUpdateServerDefResourceAccess(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char dev0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char dev1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + + TFE_TensorHandle* var_handle0 = TestVariable(ctx, 1.0, dev0_name); + EXPECT_NE(var_handle0, nullptr); + TFE_TensorHandle* var_handle1 = TestVariable(ctx, 2.0, dev1_name); + EXPECT_NE(var_handle1, nullptr); + + TFE_TensorHandle* value_handle = nullptr; + ReadVariable(ctx, var_handle1, &value_handle); + CheckTFE_TensorHandleHasFloats(value_handle, {2}); + TFE_DeleteTensorHandle(value_handle); + + // Start a new worker to replace task:1 + ReplaceTaskInServerDef(&server_def, 1); + server_def.set_task_index(1); + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + // Update server def to replace the remote device with the device info on the + // new worker (different incarnation ID). + server_def.set_task_index(0); + string serialized_update = server_def.SerializeAsString(); + TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), + serialized_update.size(), status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // The device of var_handle0 is local device which is the same before and + // after cluster update. Remove resource with valid device should succeed. + TFE_Op* op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle0, status); + TFE_OpSetDevice(op, dev0_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + // The device of var_handle1 is remote device, which was replaced during + // cluster update. Removing resource with invalid device should fail + // gracefully (i.e., with error status) instead of crashing with segfaults. + op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle1, status); + TFE_OpSetDevice(op, dev1_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + TFE_DeleteTensorHandle(var_handle0); + TFE_DeleteTensorHandle(var_handle1); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. 
+ worker_server.release(); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccess) { + TestRemoteExecuteUpdateServerDefResourceAccess(false); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccessAsync) { + TestRemoteExecuteUpdateServerDefResourceAccess(true); +} + void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { // Fail fast on GetStatus requests so we can get errors instead of timeout // when updating cluster with non-exsitent worker @@ -282,6 +381,7 @@ void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( {2, tensorflow::strings::StrCat("localhost:", port)}); + server_def.set_task_index(0); string serialized_update = server_def.SerializeAsString(); TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), serialized_update.size(), status); @@ -310,4 +410,70 @@ TEST(CAPI, RemoteExecuteUpdateServerDefWithFailuresAsync) { TestRemoteExecuteUpdateServerDefWithFailures(true); } +void TestConnectToCluster(bool keep_localhost_for_first_connect) { + // Fail fast on GetStatus requests so we can get errors instead of timeout + // when updating cluster with non-exsitent worker + tensorflow::setenv("GRPC_FAIL_FAST", "TRUE", /*overwrite=*/1); + + const string first_name = + keep_localhost_for_first_connect ? "localhost" : "abc"; + tensorflow::ServerDef server_def = GetServerDef(first_name, 1); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + const string dev0_name = "/job:localhost/replica:0/task:0/device:CPU:0"; + TFE_TensorHandle* var_handle0 = TestVariable(ctx, 1.0, dev0_name); + EXPECT_NE(var_handle0, nullptr); + + tensorflow::Status status2; + EXPECT_EQ(tensorflow::unwrap(var_handle0)->DeviceName(&status2), dev0_name); + + // Rename local device + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const string dev1_name = + absl::StrCat("/job:", first_name, "/replica:0/task:0/device:CPU:0"); + TFE_TensorHandle* var_handle1 = TestVariable(ctx, 2.0, dev1_name); + EXPECT_NE(var_handle1, nullptr); + EXPECT_EQ(tensorflow::unwrap(var_handle1)->DeviceName(&status2), dev1_name); + + // Another renaming of local device + const string second_name = "def"; + server_def.set_job_name(second_name); + server_def.mutable_cluster()->mutable_job(0)->set_name(second_name); + (*server_def.mutable_cluster()->mutable_job(0)->mutable_tasks())[0] = + absl::StrCat(second_name, ":", + tensorflow::testing::PickUnusedPortOrDie()); + + serialized = server_def.SerializeAsString(); + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const string dev2_name = "/job:def/replica:0/task:0/device:CPU:0"; + TFE_TensorHandle* var_handle2 = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle2, nullptr); + EXPECT_EQ(tensorflow::unwrap(var_handle2)->DeviceName(&status2), dev2_name); + + TFE_DeleteTensorHandle(var_handle0); + TFE_DeleteTensorHandle(var_handle1); + TFE_DeleteTensorHandle(var_handle2); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + tensorflow::unsetenv("GRPC_FAIL_FAST"); +} + +TEST(CAPI, ConnectToClusterLocalhostFirst) { TestConnectToCluster(false); } + +TEST(CAPI, ConnectToClusterRenameFirst) { TestConnectToCluster(true); } + } // namespace diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc new file mode 100644 index 00000000000..65f8d3cc646 --- /dev/null +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -0,0 +1,506 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +namespace { + +using ::tensorflow::string; + +// Add the values of three variables on three different tasks. +string AddVariablesFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'AddVariablesFunction'" + " input_arg {" + " name: 'var'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'sum'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:0/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read1'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:1/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read2'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:2/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add1'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read1:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add2'" + " op: 'Add'" + " input: 'add1:z:0'" + " input: 'read2:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'sum'" + " value: 'add2:z:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(op, var_handle, status); + TFE_TensorHandle* is_initialized[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(op, &is_initialized[0], &num_retvals, status); + CHECK_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); + bool initialized = false; + memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); + EXPECT_EQ(initialized, true); + TF_DeleteTensor(t); + TFE_DeleteTensorHandle(is_initialized[0]); + TFE_DeleteOp(op); + delete status; +} + +void TestFunctionWithPackedInput(const bool remote) { + tensorflow::ServerDef server_def = GetServerDef(3); + + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(/*enable=*/true)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + const char task0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + // Create one variable per task. + TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); + TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); + TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + + // Add a sync point in order to make sure that variables have been initialized + // before the function execution starts. + // TODO(b/155789951): Remove once b/155789951 is fixed. + VarIsInitialized(ctx, h1); + VarIsInitialized(ctx, h2); + + // Pack 3 variable handles into one TFE_TensorHandle. + int num_replicas = 3; + std::vector handles = {h0, h1, h2}; + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &num_replicas, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(TFE_TensorHandleDataType(packed_handle), TF_RESOURCE); + EXPECT_EQ(TFE_TensorHandleNumDims(packed_handle, status), 0); + EXPECT_EQ(TFE_TensorHandleNumElements(packed_handle, status), 1); + + const string composite_device_name = + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; + EXPECT_EQ(TFE_TensorHandleDeviceName(packed_handle, status), + composite_device_name); + EXPECT_EQ(TFE_TensorHandleBackingDeviceName(packed_handle, status), + composite_device_name); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // Register and run a function which returns the sum of 3 variables. 
+ const string function_def = AddVariablesFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "AddVariablesFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, packed_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + if (remote) { + TFE_OpSetDevice(func, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(packed_handle); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + EXPECT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(sum, 6.0); + + TFE_DeleteTensorHandle(h0); + TFE_DeleteTensorHandle(h1); + TFE_DeleteTensorHandle(h2); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_ContextRemoveFunction(ctx, "AddVariablesFunction", status); + TFE_DeleteContext(ctx); + + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, TestLocalFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/false); +} + +TEST(CAPI, TestRemoteFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/true); +} + +string VariableAddFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { + public: + FunctionErrorInjectionPass(string error_node, string error_device) + : error_node_(error_node), error_device_(error_device) {} + tensorflow::Status Run(const tensorflow::DeviceSet& device_set, + const tensorflow::ConfigProto& config_proto, + std::unique_ptr* graph, + tensorflow::FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override { + // Inject failure to function instantiation if finding a node that 
contains + // the given node name (error_node_) and requested device (error_device_). + for (const auto node : graph->get()->nodes()) { + if (node->name().find(error_node_) != string::npos && + node->requested_device() == error_device_) { + return tensorflow::errors::Internal("Injected graph pass error."); + } + } + return tensorflow::Status::OK(); + } + + private: + const string error_node_; + const string error_device_; +}; + +void TestDistributedFunctionCancellation(bool inject_error) { + tensorflow::ServerDef server_def = GetServerDef(3); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + if (inject_error) { + // Inject a function optimization pass failure when it sees the 'read0' op + // having a requested device `dev2_name`. During execution: + // * task:0 processes the main function `VariableAddFunction` and places + // the read0 op on task:2 + // * task:0 partitions the main function with a subgraph containing read0 + // sent to task:2 + // * task:2 graph pass reports an error when it sees read0 with dev2_name + tensorflow::function_optimization_registration:: + FunctionOptimizationPassRegistration register_test_pass( + std::make_unique("read0", dev2_name)); + } + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle, nullptr); + + const string function_def = VariableAddFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, var_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + + if (inject_error) { + ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); + } else { + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + ASSERT_EQ(sum, 4.0); + } + + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(var_handle); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // 
TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, DistributedFunctionNoError) { + TestDistributedFunctionCancellation(false); +} + +TEST(CAPI, DistributedFunctionCancelledOnError) { + TestDistributedFunctionCancellation(true); +} + +void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, + TFE_DEVICE_PLACEMENT_EXPLICIT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Use large matrices so that RPCs don't return before we get a chance + // to call TFE_DeleteContext. + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle100x100(ctx); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle100x100(ctx); + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + auto* h0_task1 = + TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + auto* h1_task1 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(h1_task0); + TFE_DeleteTensorHandle(h0_task1); + TFE_DeleteTensorHandle(h1_task1); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_DeleteContext(ctx); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); +} + +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(false); +} + +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(true); +} +} // namespace diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 820650e315f..0d71b11531b 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/lib/monitoring/counter.h" @@ -638,3 +639,35 @@ TFE_TensorHandle* TFE_NewTensorHandleFromTensor(TFE_Context* ctx, TF_Tensor* t, return tensorflow::wrap( tensorflow::unwrap(ctx)->CreateLocalHandle(t->tensor)); } + +TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, + TFE_TensorHandle** handles, + int* num_handles, + TF_Status* status) { + std::vector tensor_handles; + tensor_handles.reserve(*num_handles); + for (int i = 0; i < *num_handles; ++i) { + tensor_handles.push_back( + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(handles[i]))); + } + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + tensorflow::TensorHandle* handle = nullptr; + status->status = tensorflow::TensorHandle::CreatePackedHandle( + std::move(tensor_handles), context, &handle); + return tensorflow::wrap(handle); +} + +void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetAllowSoftPlacement(enable); +} + +void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetLogDevicePlacement(enable); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 33adce40da0..1b8efe61ee0 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -541,6 +541,26 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_CAPI_EXPORT TFE_TensorHandle* TFE_NewTensorHandleFromTensor( TFE_Context* ctx, TF_Tensor* t, TF_Status* status); +// Create a packed TensorHandle with the given list of TensorHandles. +// If `handles` are on the same device, assign the same device to the packed +// handle; if `handles` are on different deivces, assign a CompositeDevice to +// it. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle( + TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles, + TF_Status* status); + +// Configure soft device placement policy for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Configure device placement policy logging for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 0f988b1456d..94c32cf3f30 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -19,37 +19,22 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" namespace { using ::tensorflow::string; -tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { - tensorflow::ServerDef server_def; - server_def.set_protocol("grpc"); - server_def.set_job_name(job_name); - server_def.set_task_index(0); - tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); - tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name(job_name); - for (int i = 0; i < num_tasks; i++) { - int port = tensorflow::testing::PickUnusedPortOrDie(); - job_def->mutable_tasks()->insert( - {i, tensorflow::strings::StrCat("localhost:", port)}); - } - return server_def; -} - -tensorflow::ServerDef GetServerDef(int num_tasks) { - return GetServerDef("localhost", num_tasks); -} - void TestRemoteExecute(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -351,74 +336,4 @@ TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { /*heavy_load_on_streaming_rpc=*/true); } -void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { - tensorflow::ServerDef server_def = GetServerDef(2); - - // This server def has the task index set to 0. - string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - - std::unique_ptr worker_server; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server) - .ok()); - ASSERT_TRUE(worker_server->Start().ok()); - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, - TFE_DEVICE_PLACEMENT_EXPLICIT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - // Use large matrices so that RPCs don't return before we get a chance - // to call TFE_DeleteContext. 
- TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle100x100(ctx); - TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle100x100(ctx); - const char remote_device_name[] = - "/job:localhost/replica:0/task:1/device:CPU:0"; - auto* h0_task1 = - TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - auto* h1_task1 = - TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); - TFE_OpSetDevice(matmul, remote_device_name, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_TensorHandle* retvals[1]; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteStatus(status); - - TFE_DeleteTensorHandle(h0_task0); - TFE_DeleteTensorHandle(h1_task0); - TFE_DeleteTensorHandle(h0_task1); - TFE_DeleteTensorHandle(h1_task1); - TFE_DeleteTensorHandle(retvals[0]); - - TFE_DeleteOp(matmul); - - TFE_DeleteContext(ctx); - - // TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server.release(); -} - -TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { - TestRemoteExecuteDeleteContextWithOutstandingRPC(false); -} - -TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { - TestRemoteExecuteDeleteContextWithOutstandingRPC(true); -} } // namespace diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 3160cb0e585..724176505ba 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1132,51 +1132,6 @@ void BM_ExecuteFunction(int iters, int async) { } BENCHMARK(BM_ExecuteFunction)->Arg(0)->Arg(1); -TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, - TF_Status* status) { - // Create the variable handle. - TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpSetAttrShape(op, "shape", {}, 0, status); - TFE_OpSetAttrString(op, "container", "", 0); - TFE_OpSetAttrString(op, "shared_name", "", 0); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_TensorHandle* var_handle = nullptr; - int num_retvals = 1; - TFE_Execute(op, &var_handle, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(1, num_retvals); - - // Assign 'value' to it. - op = TFE_NewOp(ctx, "AssignVariableOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpAddInput(op, var_handle, status); - - // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. - std::unique_ptr t( - TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); - memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); - - std::unique_ptr - value_handle(TFE_NewTensorHandle(t.get(), status), - TFE_DeleteTensorHandle); - if (TF_GetCode(status) != TF_OK) return nullptr; - - TFE_OpAddInput(op, value_handle.get(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; - - num_retvals = 0; - TFE_Execute(op, nullptr, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(0, num_retvals); - - return var_handle; -} - TEST(CAPI, Variables) { // Variables use resource handles, so this is really a test for resource // tensor handling. 
@@ -1186,7 +1141,7 @@ TEST(CAPI, Variables) { ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 12.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 12.0); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); @@ -1227,7 +1182,7 @@ void BM_ReadVariable(int iters) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 5.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 5.0); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); @@ -1248,6 +1203,8 @@ void BM_ReadVariable(int iters) { CHECK_EQ(0, TFE_TensorHandleNumDims(h, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); h = nullptr; + TFE_OpAddInput(op, var_handle, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } tensorflow::testing::StopTiming(); TFE_DeleteOp(op); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index e67e17963b3..4b5ad8f50f7 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -18,7 +18,9 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" using tensorflow::string; @@ -133,6 +135,58 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx) { return th; } +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name) { + TF_Status* status = TF_NewStatus(); + // Create the variable handle. + TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpSetAttrShape(op, "shape", {}, 0, status); + TFE_OpSetAttrString(op, "container", "", 0); + TFE_OpSetAttrString(op, "shared_name", "", 0); + if (!device_name.empty()) { + TFE_OpSetDevice(op, device_name.c_str(), status); + } + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + TFE_Execute(op, &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(1, num_retvals); + + // Assign 'value' to it. + op = TFE_NewOp(ctx, "AssignVariableOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var_handle, status); + + // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. 
+ std::unique_ptr t( + TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); + memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); + + std::unique_ptr + value_handle(TFE_NewTensorHandle(t.get(), status), + TFE_DeleteTensorHandle); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_OpAddInput(op, value_handle.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(0, num_retvals); + + TF_DeleteStatus(status); + + return var_handle; +} + TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { TF_Status* status = TF_NewStatus(); @@ -244,3 +298,23 @@ bool GetDeviceName(TFE_Context* ctx, string* device_name, TF_DeleteDeviceList(devices); return false; } + +tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name(job_name); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name(job_name); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost:", port)}); + } + return server_def; +} + +tensorflow::ServerDef GetServerDef(int num_tasks) { + return GetServerDef("localhost", num_tasks); +} diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 11ae6d1181b..fcf62223f14 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" // Return a tensor handle containing a float scalar TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, float value); @@ -42,6 +43,11 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(TFE_Context* ctx); // Return a tensor handle containing a 3x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx); +// Return a variable handle referring to a variable with the given initial value +// on the given device. +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name = ""); + // Return an add op multiplying `a` by `b`. TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); @@ -67,4 +73,11 @@ TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, bool GetDeviceName(TFE_Context* ctx, tensorflow::string* device_name, const char* device_type); +// Create a ServerDef with the given `job_name` and add `num_tasks` tasks in it. +tensorflow::ServerDef GetServerDef(const tensorflow::string& job_name, + int num_tasks); + +// Create a ServerDef with job name "localhost" and add `num_tasks` tasks in it. +tensorflow::ServerDef GetServerDef(int num_tasks); + #endif // TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 68afffb28b4..e5030a602b3 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" @@ -26,6 +28,51 @@ using tensorflow::string; using tensorflow::internal::OutputList; using tensorflow::internal::unwrap; +namespace tensorflow { +namespace internal { +typedef absl::flat_hash_map FactoriesMap; + +static FactoriesMap& GetFactories() { + static FactoriesMap* factories = new FactoriesMap; + return *factories; +} + +static const char* default_factory = ""; + +void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { + assert((!GetFactories().count(name)) || + (GetFactories()[name] == factory) && + "Duplicate tracing factory registration"); + GetFactories()[name] = factory; +} + +void SetDefaultTracingEngine(const char* name) { default_factory = name; } + +static ExecutionContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + auto entry = GetFactories().find(default_factory); + if (entry != GetFactories().end()) return entry->second(fn_name, s); + string msg = absl::StrCat( + "No tracing engine factory has been registered with the key '", + default_factory, "' (available: "); + // Ensure deterministic (sorted) order in the error message + std::set factories_sorted; + for (const auto& factory : GetFactories()) + factories_sorted.insert(factory.first); + const char* comma = ""; + for (const string& factory : factories_sorted) { + msg += comma + factory; + comma = ", "; + } + msg += ")"; + + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return nullptr; +} + +} // end namespace internal +} // end namespace tensorflow + // ============================================================================= // Public C API entry points // @@ -36,6 +83,28 @@ using tensorflow::internal::unwrap; // // ============================================================================= +void TF_SetTracingImplementation(const char* name) { + tensorflow::internal::SetDefaultTracingEngine(name); +} + +// Creates a new TensorFlow function, it is an execution context attached to a +// given tracing context. 
+TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* s) { + return wrap(tensorflow::internal::CreateTracingExecutionContext(fn_name, s)); +} + +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList* outputs, TF_Status* s) { + auto* func = wrap(unwrap(ctx)->Finalize(unwrap(outputs), s)); + TF_DeleteExecutionContext(ctx); + return func; +} + +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s) { + return wrap(unwrap(func)->AddParameter(dtype, s)); +} + void TF_DeleteExecutionContext(TF_ExecutionContext* c) { delete unwrap(c); } TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* c) { @@ -58,6 +127,10 @@ int TF_OutputListNumOutputs(TF_OutputList* o) { TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i) { return wrap(unwrap(o)->outputs[i]); } +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status* s) { + unwrap(o)->outputs.push_back(unwrap(tensor)); +} void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, TF_Status* s) { diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index be8fc64c2e1..86c59a7f625 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -49,15 +49,26 @@ typedef struct TF_AbstractOp TF_AbstractOp; // setting functional attributes of other composite ops e.g. control flow. typedef struct TF_AbstractFunction TF_AbstractFunction; -// Creates a context for tracing the execution of operations into a function. -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s); +// This allows the client to swap the implementation of the tracing engine. +// Any future call to TF_CreateFunction will use the implementation defined +// here. +void TF_SetTracingImplementation(const char* name); + +// Creates a new TensorFlow function. A Function is an execution context, and as +// such it can trace operations through TF_ExecuteOperation. After completing +// tracing, a function can be obtained by TF_FinalizeFunction. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* status); // Creates a context for eager execution of operations. TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions*, TF_Status* s); - void TF_DeleteExecutionContext(TF_ExecutionContext*); +// Add a new parameter to a TensorFlow Function. +// TODO(aminim): what about shape? +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s); + // Create an operation suitable to use with the provided context. The operation // requires its type (e.g. "AddV2") to be set independently. TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* ctx); @@ -77,19 +88,21 @@ void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, void TF_DeleteAbstractTensor(TF_AbstractTensor*); // TF_OutputList holds the list of TF_AbstractTensor that results from executing -// an operation. -// It just lets us not specify the number of outputs of an operation -// beforehand. This forces a memory allocation in the runtime, which is bad, but -// it allows for generic code. -// TODO(aminim): the description above isn't clear with respect to -// TF_OutputListNumOutputs and the current eager implementation which requires -// the number of outputs to be set by the client. +// an operation, or provided to create a function. 
+// When executing an operation in an eager context, the expected number of +// outputs must be set beforehand with `TF_OutputListSetNumOutputs`. typedef struct TF_OutputList TF_OutputList; TF_OutputList* TF_NewOutputList(); void TF_DeleteOutputList(TF_OutputList* o); -void TF_OutputListSetNumOutputs(TF_OutputList* o, int, TF_Status*); +// Prepare tracing to the expected number of output for an operation. +void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status*); +// Return the number of outputs in the list. int TF_OutputListNumOutputs(TF_OutputList* o); +// Return the `i`th output in the list. TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i); +// Append a tensor at the end of the output list, growing its size by one. +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status*); // TF_ExecuteOperation will, if in eager mode, execute, if in graph mode, maybe // capture some inputs and then add a node in the graph. The output tensors are @@ -100,13 +113,12 @@ void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_ExecutionContext* ctx, TF_Status* s); // Creates a new TF_AbstractFunction from the current tracing states in the -// context. The returned TF_GraphToFunction must be deleted by the client. +// context. The provided `ctx` is consumed by this API call and deleted. +// The returned TF_AbstractFunction must be deleted by the client, // TODO(aminim): clarify the contract on the state of the context after this // call. -TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status); +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList*, TF_Status*); void TF_DeleteAbstractFunction(TF_AbstractFunction*); diff --git a/tensorflow/c/eager/c_api_unified_experimental_eager.cc b/tensorflow/c/eager/c_api_unified_experimental_eager.cc index 820c61445fb..cf8cf845834 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_eager.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_eager.cc @@ -123,6 +123,17 @@ class EagerContext : public ExecutionContext { } } + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't add function parameter on an eager context."); + return nullptr; + } + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't use finalize function on an eager context."); + return nullptr; + } + void RegisterFunction(AbstractFunction* afunc, TF_Status* s) override { auto* func = afunc->GetTfFunction(s); if (!func) { diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 36f8353894b..dd5a95b3526 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_unified_experimental.h" @@ -114,12 +115,14 @@ struct GraphFunction : public AbstractFunction { static constexpr AbstractFunctionKind kKind = kGraphFunc; }; -// GraphContext wraps a TF_Graph and manages the "execution" of operation, i.e. -// adding them to the graph. 
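Taken together, the new entry points above replace the old TF_NewGraphExecutionContext / TF_ExecutionContextToFunction pair. The following is a minimal sketch of the new call sequence, mirroring the updated unit tests further down; error checking is elided, and it assumes the default "graphdef" tracing engine (registered later in this change) is available:

#include "tensorflow/c/eager/c_api_unified_experimental.h"
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/c/tf_status.h"

// Traces y = x + x and returns it as a TF_AbstractFunction.
TF_AbstractFunction* TraceDoubleFunction() {
  TF_Status* s = TF_NewStatus();
  TF_SetTracingImplementation("graphdef");
  TF_ExecutionContext* fn_ctx = TF_CreateFunction("double", s);
  TF_AbstractTensor* x = TF_AddFunctionParameter(fn_ctx, TF_FLOAT, s);

  TF_AbstractOp* add = TF_NewAbstractOp(fn_ctx);
  TF_AbstractOpSetOpType(add, "Add", s);
  TF_AbstractOpSetOpName(add, "my_add", s);
  TF_AbstractTensor* inputs[2] = {x, x};
  TF_OutputList* outputs = TF_NewOutputList();
  TF_ExecuteOperation(add, 2, inputs, outputs, fn_ctx, s);
  TF_DeleteAbstractOp(add);

  // Finalize consumes `fn_ctx`; only the returned function must be deleted
  // (with TF_DeleteAbstractFunction) by the caller.
  TF_AbstractFunction* fn = TF_FinalizeFunction(fn_ctx, outputs, s);
  TF_DeleteOutputList(outputs);
  TF_DeleteStatus(s);
  return fn;
}

The resulting function can then be registered with an eager execution context and invoked by name, exactly as the TestBasicGraph and TestMultiOutputGraph tests below do.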
+// GraphContext wraps a TF_Graph modeling a single function and manages the +// "execution" of operation, i.e. adding them to the function. class GraphContext : public ExecutionContext { public: - GraphContext() - : ExecutionContext(kKind), graph_(new TF_Graph(), TF_DeleteGraph) {} + explicit GraphContext(const char* name) + : ExecutionContext(kKind), + graph_(new TF_Graph(), TF_DeleteGraph), + name_(name) {} AbstractOp* CreateOperation() override { // TODO(srbs): Should the lifetime of this op be tied to the context. @@ -136,6 +139,10 @@ class GraphContext : public ExecutionContext { return; } auto* tf_opdesc = graph_op->op_.release(); + if (tf_opdesc == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, "AbstractOp is incomplete."); + return; + } for (int i = 0; i < num_inputs; ++i) { auto* graph_tensor = dyncast(inputs[i]); if (!graph_tensor) { @@ -164,24 +171,38 @@ class GraphContext : public ExecutionContext { } } - TF_Function* ToFunction(const char* fn_name, int num_inputs, - const GraphTensor* inputs, int num_outputs, - const GraphTensor* outputs, TF_Status* status) const { - std::vector graph_inputs; - graph_inputs.resize(num_inputs); + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_OperationDescription* opdesc = + TF_NewOperation(graph_.get(), "Placeholder", + absl::StrCat("_input_", inputs_.size()).c_str()); + TF_SetAttrType(opdesc, "dtype", dtype); + auto* operation = TF_FinishOperation(opdesc, s); + if (!s->status.ok()) return nullptr; + + inputs_.push_back(TF_Output{operation, 0}); + return new GraphTensor(inputs_.back(), this); + } + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + std::unique_ptr func(new GraphFunction); std::vector graph_outputs; - graph_outputs.resize(num_outputs); - for (int i = 0; i < num_inputs; i++) { - graph_inputs[i] = inputs[i].output; - } - for (int i = 0; i < num_outputs; i++) { - graph_outputs[i] = outputs[i].output; + graph_outputs.reserve(outputs->outputs.size()); + for (AbstractTensor* abstract_output : outputs->outputs) { + GraphTensor* output = dyncast(abstract_output); + if (!output) { + TF_SetStatus(s, TF_UNIMPLEMENTED, + "Returning a non-graph tensor from a function has not " + "been implemented yet."); + return nullptr; + } + graph_outputs.push_back(output->output); } - return TF_GraphToFunction(graph_.get(), fn_name, 0, -1, nullptr, - graph_inputs.size(), graph_inputs.data(), - graph_outputs.size(), graph_outputs.data(), - nullptr, nullptr, fn_name, status); + func->func = TF_GraphToFunction( + graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + if (TF_GetCode(s) != TF_OK) return nullptr; + return func.release(); } void RegisterFunction(AbstractFunction* func, TF_Status* s) override { @@ -195,54 +216,20 @@ class GraphContext : public ExecutionContext { private: std::unique_ptr graph_; + std::vector inputs_; + const char* name_; }; -// Helper that converts the graph currently held in the context into a function. 
-static AbstractFunction* ExecutionContextToFunction( - const ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const AbstractTensor* inputs, int num_outputs, - const AbstractTensor* outputs, TF_Status* status) { - auto* graph_ctx = dyncast(fn_body); - if (graph_ctx == nullptr) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "fn_body is not a TF_GraphContext."); - return nullptr; - } - auto* graph_inputs = dyncast(inputs); - if (!graph_inputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "inputs aren't GraphTensors."); - return nullptr; - } - auto* graph_outputs = dyncast(outputs); - if (!graph_outputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "outputs aren't GraphTensors."); - return nullptr; - } - GraphFunction* func = new GraphFunction; - func->func = graph_ctx->ToFunction(fn_name, num_inputs, graph_inputs, - num_outputs, graph_outputs, status); - return func; +static ExecutionContext* GraphTracingFactory(const char* name, TF_Status* s) { + return new GraphContext(name); } +// Register the tracing implemented in this file as the default tracing engine. +static bool register_tracing = [] { + RegisterTracingEngineFactory("graphdef", GraphTracingFactory); + SetDefaultTracingEngine("graphdef"); + return true; +}(); + } // namespace internal } // namespace tensorflow - -// ============================================================================= -// Public C API entry points -// These are only the entry points specific to the Graph API. -// ============================================================================= - -using tensorflow::internal::unwrap; - -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s) { - return wrap(new tensorflow::internal::GraphContext()); -} - -TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status) { - return wrap(ExecutionContextToFunction(unwrap(fn_body), fn_name, num_inputs, - unwrap(inputs), num_outputs, - unwrap(outputs), status)); -} diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index ab085a20ff0..8fc696f0f2f 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace internal { @@ -57,7 +58,7 @@ T* dyncast(S source) { // GraphContext and vice-versa). class AbstractTensor { protected: - enum AbstractTensorKind { kGraphTensor, kEagerTensor, kMLIRTensor }; + enum AbstractTensorKind { kMlirTensor, kGraphTensor, kEagerTensor }; explicit AbstractTensor(AbstractTensorKind kind) : kind_(kind) {} public: @@ -100,7 +101,7 @@ class AbstractFunction { // on a given context, with the same or different input tensors. class AbstractOp { protected: - enum AbstractOpKind { kGraphOp, kEagerOp }; + enum AbstractOpKind { kMlirOp, kGraphOp, kEagerOp }; explicit AbstractOp(AbstractOpKind kind) : kind_(kind) {} public: @@ -128,7 +129,7 @@ class AbstractOp { // eager implementation or to a graph implementation. 
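The static `register_tracing` initializer above is how the graphdef backend plugs itself in; an alternative tracing engine can hook in the same way through RegisterTracingEngineFactory / SetDefaultTracingEngine, declared in the internal header below. A sketch under the assumption of a hypothetical engine name "my_engine"; a real factory would return a concrete ExecutionContext subclass rather than reporting an error:

#include "tensorflow/c/eager/c_api_unified_experimental_internal.h"
#include "tensorflow/c/tf_status.h"

namespace tensorflow {
namespace internal {

// Placeholder factory: a real one returns a concrete ExecutionContext
// implementation (as GraphTracingFactory does above).
static ExecutionContext* MyTracingFactory(const char* fn_name, TF_Status* s) {
  TF_SetStatus(s, TF_UNIMPLEMENTED, "my_engine tracing is not implemented");
  return nullptr;
}

// Self-registration at static-initialization time, mirroring register_tracing.
static bool register_my_tracing = [] {
  RegisterTracingEngineFactory("my_engine", MyTracingFactory);
  // Optionally route TF_CreateFunction to this engine by default:
  // SetDefaultTracingEngine("my_engine");
  return true;
}();

}  // namespace internal
}  // namespace tensorflow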
struct ExecutionContext { protected: - enum ExecutionContextKind { kGraphContext, kEagerContext }; + enum ExecutionContextKind { kMlirContext, kGraphContext, kEagerContext }; explicit ExecutionContext(ExecutionContextKind kind) : k(kind) {} public: @@ -148,6 +149,17 @@ struct ExecutionContext { // Creates an empty AbstractOperation suitable to use with this context. virtual AbstractOp* CreateOperation() = 0; + // Add a function parameter and return the corresponding tensor. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) = 0; + + // Finalize this context and make a function out of it. The context is in a + // invalid state after this call and must be destroyed. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) = 0; + // Registers a functions with this context, after this the function is // available to be called/referenced by its name in this context. virtual void RegisterFunction(AbstractFunction* func, TF_Status* s) = 0; @@ -156,6 +168,11 @@ struct ExecutionContext { const ExecutionContextKind k; }; +typedef ExecutionContext* (*FactoryFunction)(const char* fn_name, TF_Status*); +void SetDefaultTracingEngine(const char* name); +void RegisterTracingEngineFactory(const ::tensorflow::string& name, + FactoryFunction factory); + // Create utilities to wrap/unwrap: this convert from the C opaque types to the // C++ implementation, and back. #define MAKE_WRAP_UNWRAP(C_TYPEDEF, CPP_CLASS) \ diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 170b82333d8..24d170f2f99 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -29,7 +29,12 @@ using tensorflow::string; namespace tensorflow { namespace { -TEST(UnifedCAPI, TestBasicEager) { +class UnifiedCAPI : public ::testing::TestWithParam { + protected: + void SetUp() override { TF_SetTracingImplementation(GetParam()); } +}; + +TEST_P(UnifiedCAPI, TestBasicEager) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -81,33 +86,18 @@ TEST(UnifedCAPI, TestBasicEager) { TF_DeleteExecutionContext(ctx); } -TEST(UnifedCAPI, TestBasicGraph) { +TEST_P(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + // Start a new function / execution context. + string fn_name = "double"; + TF_ExecutionContext* graph_ctx = + TF_CreateFunction(fn_name.c_str(), status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Add a placeholder to the graph. 
- auto* placeholder_op = TF_NewAbstractOp(graph_ctx); - TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + auto* placeholder_t = + TF_AddFunctionParameter(graph_ctx, TF_FLOAT, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetAttrType(placeholder_op, "dtype", TF_FLOAT, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - - // Build inputs and outputs. - TF_OutputList* placeholder_outputs = TF_NewOutputList(); - - // Execute. - TF_ExecuteOperation(placeholder_op, 0, nullptr, placeholder_outputs, - graph_ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - ASSERT_EQ(1, TF_OutputListNumOutputs(placeholder_outputs)); - TF_AbstractTensor* placeholder_t = TF_OutputListGet(placeholder_outputs, 0); - - // Delete placeholder op. - TF_DeleteAbstractOp(placeholder_op); // Build an abstract operation. auto* add_op = TF_NewAbstractOp(graph_ctx); @@ -123,16 +113,13 @@ TEST(UnifedCAPI, TestBasicGraph) { // Execute. TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractTensor* output_t = TF_OutputListGet(add_outputs, 0); // Clean up operation and inputs. TF_DeleteAbstractOp(add_op); - string fn_name = "double"; - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - graph_ctx, fn_name.c_str(), 1, placeholder_t, 1, output_t, status.get()); - TF_DeleteAbstractTensor(placeholder_t); - TF_DeleteAbstractTensor(output_t); + TF_AbstractFunction* func = + TF_FinalizeFunction(graph_ctx, add_outputs, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build eager context. TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -173,18 +160,161 @@ TEST(UnifedCAPI, TestBasicGraph) { ASSERT_EQ(*f_value, 4.0); TF_DeleteOutputList(add_outputs); - TF_DeleteOutputList(placeholder_outputs); TF_DeleteAbstractOp(fn_op); TF_DeleteAbstractTensor(input_t); TF_DeleteAbstractTensor(final_result); TF_DeleteTensor(f_t); TF_DeleteAbstractFunction(func); - TF_DeleteExecutionContext(graph_ctx); TF_DeleteExecutionContext(eager_execution_ctx); } -TEST(UnifedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST_P(UnifiedCAPI, TestMultiOutputGraph) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Status* s = status.get(); + + // Start a new function / execution context. + string fn_name = "two_adds"; + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + auto* arg0 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + auto* arg1 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Create a first "Add" computing `arg0 + arg1`. + TF_AbstractTensor* add_output1; + { + // Build an abstract operation, inputs and output. 
+ auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add1", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg0, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output1 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Same with a second "Add" computing `arg1 + arg1`. + TF_AbstractTensor* add_output2; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add2", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg1, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output2 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Finalize the function by providing the returned values. + TF_AbstractFunction* func; + { + // We want to return the output of both add operations, create a new list + // and populate it. + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListPushBack(func_outputs, add_output1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, add_output2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + func = TF_FinalizeFunction(graph_ctx, func_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteOutputList(func_outputs); + } + + /** + * We traced so far this function: + * + * def two_adds(a, b): + * my_add1 = a + b + * my_add2 = b + b + * return my_add1, my_add2 + * + * Now we will execute this function with an eager context: + * + * output1, output2 = two_adds(2.0, 3.0) + * + * and check that we got 5.0 and 6.0 as results. + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build two abstract input tensors as function arguments. 
+ std::vector func_args; + { + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx); + TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, 2.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + input_eager = TestScalarTensorHandle(eager_ctx, 3.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + } + + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(func_outputs, 2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_ExecuteOperation(fn_op, func_args.size(), func_args.data(), func_outputs, + eager_execution_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(fn_op); + for (TF_AbstractTensor* t : func_args) TF_DeleteAbstractTensor(t); + + ASSERT_EQ(2, TF_OutputListNumOutputs(func_outputs)); + float results[2]; + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TFE_TensorHandle* handle = TF_AbstractTensorGetEagerTensor(result, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_Tensor* f_t = TFE_TensorHandleResolve(handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + results[idx] = *static_cast(TF_TensorData(f_t)); + TF_DeleteTensor(f_t); + } + ASSERT_EQ(results[0], 5.0); + ASSERT_EQ(results[1], 6.0); + + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TF_DeleteAbstractTensor(result); + } + TF_DeleteOutputList(func_outputs); + TF_DeleteExecutionContext(eager_execution_ctx); + TF_DeleteAbstractFunction(func); +} + +TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -192,18 +322,15 @@ TEST(UnifedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TFE_DeleteContextOptions(opts); - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - ctx, nullptr, 0, nullptr, 0, nullptr, status.get()); + TF_AbstractFunction* func = TF_FinalizeFunction(ctx, nullptr, status.get()); ASSERT_EQ(nullptr, func); ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); - - TF_DeleteExecutionContext(ctx); } -TEST(UnifedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -221,10 +348,10 @@ TEST(UnifedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. 
@@ -242,7 +369,7 @@ TEST(UnifedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TestExecutingEagerOpInGraphModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { // Build an Eager context. std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -272,7 +399,8 @@ TEST(UnifedCAPI, TestExecutingEagerOpInGraphModeRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build a Graph context. - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Execute eager op using graph context. @@ -288,10 +416,11 @@ TEST(UnifedCAPI, TestExecutingEagerOpInGraphModeRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TestExecutingGraphOpInEagerModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -348,5 +477,8 @@ TEST(UnifedCAPI, TestExecutingGraphOpInEagerModeRaises) { TF_DeleteExecutionContext(eager_execution_ctx); } +INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, + ::testing::Values("graphdef", "mlir")); + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index 9377bf0be12..2861fa43b66 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -59,6 +59,20 @@ class AbstractContextInterface { virtual AbstractTensorInterface* CreateTensor( DataType dtype, absl::Span dim_sizes) = 0; + typedef void (*MemoryReleaser)(void* data, size_t len, void* arg); + + // Create a tensor instance from the given data buffer and description. + // `memory_releaser` will be called on destruction, and it's responsible for + // cleaning up the underlying buffer. `convert_string` indicates whether it + // has to handle tstring conversion. Expected to be removed once tstring + // migration is done. + virtual AbstractTensorInterface* CreateTensor(DataType dtype, + const int64_t* dims, + int num_dims, void* data, + size_t len, bool convert_string, + MemoryReleaser memory_releaser, + void* memory_releaser_arg) = 0; + // Create a handle to wrap and manage a Tensor virtual AbstractTensorHandleInterface* CreateLocalHandle( AbstractTensorInterface* t) = 0; @@ -87,6 +101,9 @@ class AbstractContextInterface { // Destroy the step resource container for a training step. virtual void EndStep() = 0; + // Block until all pending nodes are finished. 
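The CreateTensor overload added above takes a caller-provided buffer together with a MemoryReleaser that frees it once the tensor is destroyed, avoiding a copy. A sketch of the pattern, assuming a valid AbstractContextInterface* is at hand; the helper names here are illustrative only and not part of this change:

#include <cstdint>

#include "tensorflow/c/eager/context_interface.h"
#include "tensorflow/core/framework/types.pb.h"

namespace tensorflow {

// Deletes the heap buffer backing the tensor once the tensor is destroyed.
static void ReleaseFloatBuffer(void* data, size_t /*len*/, void* /*arg*/) {
  delete[] static_cast<float*>(data);
}

// Wraps a caller-owned float buffer without copying it.
AbstractTensorInterface* WrapFloatBuffer(AbstractContextInterface* ctx) {
  float* data = new float[4]{1.f, 2.f, 3.f, 4.f};
  static const int64_t dims[1] = {4};
  return ctx->CreateTensor(DT_FLOAT, dims, /*num_dims=*/1, data,
                           /*len=*/4 * sizeof(float),
                           /*convert_string=*/false, &ReleaseFloatBuffer,
                           /*memory_releaser_arg=*/nullptr);
}

}  // namespace tensorflow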
+ virtual Status AsyncWait() = 0; + protected: virtual ~AbstractContextInterface() {} }; diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index f4dbcc6cead..6fce918aab1 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -27,6 +27,7 @@ cc_library( name = "parallel_device", srcs = [":sources"], hdrs = [":headers"], + visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/c:c_api", "//tensorflow/c/eager:c_api", @@ -38,11 +39,30 @@ cc_library( ], ) +cc_library( + name = "parallel_device_testlib", + testonly = 1, + srcs = ["parallel_device_testlib.cc"], + hdrs = ["parallel_device_testlib.h"], + deps = [ + ":parallel_device", + ":parallel_device_ops", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "parallel_device_test", srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", + ":parallel_device_ops", + ":parallel_device_testlib", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -52,3 +72,40 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +tf_cc_test( + name = "parallel_device_remote_test", + srcs = ["parallel_device_remote_test.cc"], + # TODO(b/136478427): Enable global heap checking when servers shut down + # cleanly. + args = ["--heap_check=local"], + deps = [ + ":parallel_device", + ":parallel_device_ops", + ":parallel_device_testlib", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + ], +) + +# Note: ParallelDevice-specific ops are experimental and not currently linked in +# to TensorFlow by default, just used in a few tests. +filegroup( + name = "parallel_device_ops_srcs", + srcs = ["parallel_device_ops.cc"], + visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], +) + +cc_library( + name = "parallel_device_ops", + srcs = [":parallel_device_ops_srcs"], + visibility = ["//tensorflow:internal"], + deps = ["//tensorflow/core:framework"], + alwayslink = 1, +) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index e6846809fcf..75d188d0c45 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -92,6 +92,10 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // A parallel tensor with scalar integers numbering component devices. + std::unique_ptr DeviceIDs(TFE_Context* context, + TF_Status* status) const; + // Takes a description of a single operation being executed on the // ParallelDevice, and in turn runs one operation per component device with // its corresponding inputs from the input ParallelTensors (or @@ -208,6 +212,46 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + // TODO(allenl): We could cache DeviceIDs (keyed by context). 
+ std::vector components; + components.reserve(underlying_devices_.size()); + for (int device_index = 0; device_index < underlying_devices_.size(); + ++device_index) { + int64_t* device_id = new int64_t; + *device_id = device_index; + std::unique_ptr tensor( + TF_NewTensor( + TF_INT64, /*dims=*/nullptr, /*num_dims=*/0, device_id, + sizeof(int64_t), + [](void* data, size_t, void* arg) { + delete reinterpret_cast(data); + }, + nullptr), + TF_DeleteTensor); + // TODO(allenl): Here and when executing regular operations, we could hold + // on to one TFE_Op per device and just call TFE_ResetOp to avoid parsing + // device names repeatedly. + OpPtr const_op(TFE_NewOp(context, "Const", status)); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(const_op.get(), underlying_devices_[device_index].c_str(), + status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT64); + TFE_TensorHandle* device_handle; + int num_outputs = 1; + TFE_Execute(const_op.get(), &device_handle, &num_outputs, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + components.emplace_back(device_handle); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + return ParallelTensor::FromTensorHandles(*this, std::move(components), + status); +} + absl::optional> ParallelDevice::Execute( TFE_Context* context, std::vector inputs, const char* operation_name, const TFE_OpAttrs* attributes, @@ -275,6 +319,11 @@ absl::optional> ParallelDevice::Execute( std::vector outputs; outputs.reserve(t->num_tensors()); for (int i = 0; i < t->num_tensors(); ++i) { + // TODO(b/157523095): Syncing the executor here shouldn't be + // necessary. Currently async+remote is missing cross-executor + // coordination. + TFE_ExecutorWaitForAllPendingNodes(executors_[i].get(), status); + if (TF_GetCode(status) != TF_OK) return result; TensorHandlePtr this_output( TFE_TensorHandleCopySharingTensor(t->tensor(i), status)); outputs.emplace_back(std::move(this_output)); @@ -282,6 +331,13 @@ absl::optional> ParallelDevice::Execute( } result.emplace(std::move(outputs)); return result; + } else if (operation_name == std::string("DeviceID")) { + std::vector result_content; + result_content.reserve(1); + result_content.push_back(DeviceIDs(context, status)); + if (TF_GetCode(status) != TF_OK) return result; + result.emplace(std::move(result_content)); + return result; } absl::optional>> maybe_parallel_results( diff --git a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc new file mode 100644 index 00000000000..1decffca047 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +// TODO(allenl): Figure out if we need this op, and if so whether we should move +// it to core TF. Right now the eager C API does some checking of op +// registrations before calling into custom devices, but we may be able to avoid +// that. +REGISTER_OP("DeviceID") + .Output("device_id: int64") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc new file mode 100644 index 00000000000..32a4b440d25 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc @@ -0,0 +1,147 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/parallel_device/parallel_device.h" +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/platform/test.h" + +tensorflow::ServerDef GetServerDef(const std::string& job_name, int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name(job_name); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name(job_name); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost", ":", port)}); + } + return server_def; +} + +TEST(PARALLEL_DEVICE, TestRemoteBasic) { + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + tensorflow::ServerDef server_def = GetServerDef("worker", 3); + + // This server def has the task index set to 0. 
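For reference, the DeviceID op registered above is intended to be placed on the parallel device itself, which intercepts it (see the `operation_name == "DeviceID"` branch added to ParallelDevice::Execute) and returns one scalar int64 per component device. A sketch of invoking it through the eager C API, assuming a parallel device has already been registered under `device_name` as in the tests:

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/tf_status.h"

// Returns a parallel tensor whose i-th component is the scalar int64 `i`,
// placed on the i-th underlying device. `device_name` names the registered
// parallel device, e.g. "/job:localhost/replica:0/task:0/device:CUSTOM:0".
TFE_TensorHandle* ParallelDeviceIDs(TFE_Context* context,
                                    const char* device_name,
                                    TF_Status* status) {
  TFE_Op* op = TFE_NewOp(context, "DeviceID", status);
  if (TF_GetCode(status) != TF_OK) return nullptr;
  TFE_OpSetDevice(op, device_name, status);
  if (TF_GetCode(status) != TF_OK) {
    TFE_DeleteOp(op);
    return nullptr;
  }
  TFE_TensorHandle* result = nullptr;
  int num_retvals = 1;
  TFE_Execute(op, &result, &num_retvals, status);
  TFE_DeleteOp(op);
  // Per-device components can then be unpacked with TPUReplicatedOutput, as
  // ExtractPerDeviceValues does in the test library.
  return result;
}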
+ std::string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TFE_ContextSetServerDef(context.get(), 0, serialized.data(), + serialized.size(), status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + BasicTestsForTwoDevices(context.get(), + "/job:worker/replica:0/task:1/device:CPU:0", + "/job:worker/replica:0/task:2/device:CPU:0"); + + worker_server1.release(); + worker_server2.release(); +} + +TEST(PARALLEL_DEVICE, TestAsyncCopyOff) { + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + tensorflow::ServerDef server_def = GetServerDef("worker", 3); + + // This server def has the task index set to 0. + std::string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TFE_ContextSetServerDef(context.get(), 0, serialized.data(), + serialized.size(), status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + const char* first_device = "/job:worker/replica:0/task:1/device:CPU:0"; + const char* second_device = "/job:worker/replica:0/task:2/device:CPU:0"; + const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + std::array underlying_devices{first_device, second_device}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TensorHandlePtr value_one(FloatTensorHandle(3., status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TensorHandlePtr value_two(FloatTensorHandle(-2., status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array in_components{value_one.get(), + value_two.get()}; + TensorHandlePtr combined_value = CreatePerDeviceValues( + context.get(), in_components, device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Loop to make synchronization failures more deterministic + for (int i = 0; i < 100; ++i) { + TensorHandlePtr multiply_result( + Multiply(context.get(), combined_value.get(), combined_value.get(), + status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array out_components; + ExtractPerDeviceValues(context.get(), multiply_result.get(), + &out_components, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(out_components[0].get(), 9.); + 
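ExpectScalarEq, used here and in the updated assertions throughout these tests, generalizes the old float-only AssertScalarFloatEq and is declared in the new parallel_device_testlib.h (not shown in this diff); call sites pass the element type explicitly, e.g. ExpectScalarEq<float>. A plausible sketch of it, not the actual implementation:

#include <memory>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/tf_status.h"
#include "tensorflow/core/platform/test.h"

// Resolves `handle` and checks that its single element equals `expected_value`.
template <typename value_type>
void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) {
  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
      TF_NewStatus(), TF_DeleteStatus);
  std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)> actual_value(
      TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor);
  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
  EXPECT_EQ(expected_value,
            *static_cast<value_type*>(TF_TensorData(actual_value.get())));
}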
ExpectScalarEq(out_components[1].get(), 4.); + } + + worker_server1.release(); + worker_server2.release(); +} diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index 9b0613b0391..d9784ac9fa6 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/c_api_experimental.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" #include "tensorflow/core/platform/test.h" // NOTE(allenl): These tests currently go through TFE_Execute and so are @@ -28,363 +29,6 @@ limitations under the License. // correspond fairly well to the implementation, but testing the C++ directly is // another option. -// Functor for making unique_ptr to TFE_TensorHandle slightly more -// ergonomic. Using decltype(TFE_DeleteTensorHandle) in the unique_ptr's second -// template argument requires passing a function pointer to -// TFE_DeleteTensorHandle when constructing the unique_ptr. -class TensorHandleDeleter { - public: - void operator()(TFE_TensorHandle* to_delete) { - TFE_DeleteTensorHandle(to_delete); - } -}; - -using TensorHandlePtr = std::unique_ptr; - -// A helper for performing common operations on variables. A much more -// restricted stand-in for tf.Variable in Python. -class Variable { - public: - // Construct a Variable from a resource-dtype TFE_TensorHandle and an - // indication of the dtype of the variable's value. - // - // Note that creating this resource-dtype handle can fail, so `Create` is a - // separate static method which returns a status. - Variable(TFE_TensorHandle* handle, TF_DataType type) - : handle_(handle), type_(type) {} - - // Helper for constructing a resource handle and wrapping it in a `Variable` - // object. - static Variable* Create(TFE_Context* context, TF_DataType type, - const int64_t* dims, const int num_dims, - const char* device, TF_Status* status); - // Dereferences the backing buffer for the variable. Note that since this can - // fail (it runs operations), it must be called explicitly and the resulting - // `status` checked. - void Destroy(TFE_Context* context, TF_Status* status); - - // Reads from the variable. - TensorHandlePtr Read(TFE_Context* context, TF_Status* status); - // Assigns a new value to the variable. - void Assign(TFE_Context* context, TFE_TensorHandle* value, TF_Status* status); - // Adds `value` to the existing value of the variable. - void AssignAdd(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status); - - private: - // Helper for running any single-argument assignment ops (Assign, AssignAdd, - // AssignSub, ...). - void GeneralAssignment(const char* op_name, TFE_Context* context, - TFE_TensorHandle* value, TF_Status* status); - - // The a handle for the resource-dtype tensor pointing to the variable's - // buffer. - TFE_TensorHandle* handle_; - // The dtype of the variable's buffer (input dtype for assignments, output - // dtype of read operations). 
- TF_DataType type_; -}; - -Variable* Variable::Create(TFE_Context* context, TF_DataType type, - const int64_t* dims, const int num_dims, - const char* device, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "VarHandleOp", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op.get(), "dtype", type); - TFE_OpSetAttrShape(op.get(), "shape", dims, num_dims, status); - TFE_OpSetAttrString(op.get(), "container", "", 0); - // Use the special GUID for no buffer sharing - // - // TODO(allenl): Should we provide a better API for this? AFAIK this is the - // only reasonable way to make variables with no aliasing using the eager C - // API. - std::string no_sharing = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; - TFE_OpSetAttrString(op.get(), "shared_name", no_sharing.c_str(), - no_sharing.length()); - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_TensorHandle* var_handle = nullptr; - int num_retvals = 1; - TFE_Execute(op.get(), &var_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return new Variable(var_handle, type); -} - -void Variable::Destroy(TFE_Context* context, TF_Status* status) { - // Free the backing buffer for the variable. - std::unique_ptr op( - TFE_NewOp(context, "DestroyResourceOp", status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return; - int num_retvals = 0; - TFE_Execute(op.get(), nullptr, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; - // Delete the variable handle itself. 
- TFE_DeleteTensorHandle(handle_); -} - -TensorHandlePtr Variable::Read(TFE_Context* context, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "ReadVariableOp", status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op.get(), "dtype", type_); - int num_retvals = 1; - TFE_TensorHandle* var_value = nullptr; - TFE_Execute(op.get(), &var_value, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(var_value); -} - -void Variable::GeneralAssignment(const char* op_name, TFE_Context* context, - TFE_TensorHandle* value, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, op_name, status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetAttrType(op.get(), "dtype", type_); - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpAddInput(op.get(), value, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - - int num_retvals = 0; - TFE_Execute(op.get(), nullptr, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; -} - -void Variable::AssignAdd(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status) { - GeneralAssignment("AssignAddVariableOp", context, value, status); -} - -void Variable::Assign(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status) { - GeneralAssignment("AssignVariableOp", context, value, status); -} - -// Passed to `TF_NewTensor` to indicate how an array of floats should be -// deleted. -static void FloatDeallocator(void* data, size_t, void* arg) { - delete[] static_cast(data); -} - -// Creates a TFE_TensorHandle with value `v`. -TensorHandlePtr FloatTensorHandle(float v, TF_Status* status) { - const int num_bytes = sizeof(float); - float* values = new float[1]; - values[0] = v; - std::unique_ptr tensor( - TF_NewTensor(TF_FLOAT, nullptr, 0, values, num_bytes, &FloatDeallocator, - nullptr), - TF_DeleteTensor); - return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); -} - -// Creates a rank-one TFE_TensorHandle with value `v`. -TensorHandlePtr VectorFloatTensorHandle(const std::vector& v, - TF_Status* status) { - const int num_bytes = v.size() * sizeof(float); - float* values = new float[v.size()]; - memcpy(values, v.data(), num_bytes); - int64_t dims = v.size(); - std::unique_ptr tensor( - TF_NewTensor(TF_FLOAT, &dims, 1 /* num_dims */, values, num_bytes, - &FloatDeallocator, nullptr), - TF_DeleteTensor); - return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); -} - -// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle. 
-template -void ExtractPerDeviceValues( - TFE_Context* context, TFE_TensorHandle* input, - std::array* components, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "TPUReplicatedOutput", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetAttrInt(op.get(), "num_replicas", num_replicas); - TFE_OpAddInput(op.get(), input, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(input, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return; - - TFE_TensorHandle* result_handles[num_replicas]; - int num_retvals = num_replicas; - TFE_Execute(op.get(), result_handles, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; - for (int i = 0; i < num_replicas; ++i) { - (*components)[i].reset(result_handles[i]); - } -} - -// Helper to pack `num_replicas` TFE_TensorHandles into one parallel handle. -template -TensorHandlePtr CreatePerDeviceValues( - TFE_Context* context, - const std::array& components, - const char* device, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "TPUReplicatedInput", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrInt(op.get(), "N", num_replicas); - for (int i = 0; i < num_replicas; ++i) { - TFE_OpAddInput(op.get(), components[i], status); - if (TF_GetCode(status) != TF_OK) return nullptr; - } - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(result_handle); -} - -TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, - TFE_TensorHandle* second, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "Mul", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), first, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), second, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - const char* first_device = TFE_TensorHandleDeviceName(first, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetDevice(op.get(), first_device, status); - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(result_handle); -} - -// Assert that `handle` is equal to `expected_value`. 
-void AssertScalarFloatEq(TFE_TensorHandle* handle, float expected_value) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - std::unique_ptr value_zero( - TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - ASSERT_EQ(expected_value, - *static_cast(TF_TensorData(value_zero.get()))); -} - -template -void RegisterParallelDevice( - TFE_Context* context, const char* device_name, - const std::array& underlying_devices, - TF_Status* status) { - TFE_CustomDevice device; - void* device_info; - tensorflow::eager::AllocateParallelDevice( - device_name, underlying_devices.data(), underlying_devices.size(), - &device, &device_info); - TFE_RegisterCustomDevice(context, device, device_name, device_info, status); -} - -// Create and modify a variable placed on a parallel device which composes -// `first_device` and `second_device`. -void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, - const char* second_device) { - // Register the custom device - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::array underlying_devices{first_device, second_device}; - RegisterParallelDevice(context, device_name, underlying_devices, - status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - // Create a variable handle (uninitialized to start) placed on the parallel - // device. - std::function variable_deleter = [&](Variable* to_delete) { - to_delete->Destroy(context, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - delete to_delete; - }; - std::unique_ptr variable( - Variable::Create(context, TF_FLOAT, /* Scalar */ {}, 0, device_name, - status.get()), - variable_deleter); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - // Assign an initial value to the variable, implicitly mirroring it to each - // component device. - { - TensorHandlePtr initial_value = FloatTensorHandle(20., status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - variable->Assign(context, initial_value.get(), status.get()); - } - - // Read from the variable and verify that we have a parallel tensor. - { - TensorHandlePtr read = variable->Read(context, status.get()); - std::array components; - ExtractPerDeviceValues(context, read.get(), &components, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - AssertScalarFloatEq(components[0].get(), 20.); - AssertScalarFloatEq(components[1].get(), 20.); - - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } - - // Add a parallel tensor with different values on each device to the variable. 
- { - TensorHandlePtr value_one(FloatTensorHandle(3., status.get())); - TensorHandlePtr value_two(FloatTensorHandle(-2., status.get())); - std::array components{value_one.get(), - value_two.get()}; - TensorHandlePtr combined_value = - CreatePerDeviceValues(context, components, device_name, status.get()); - variable->AssignAdd(context, combined_value.get(), status.get()); - } - - // Read the variable and verify that each component has the right modified - // value. - { - TensorHandlePtr read = variable->Read(context, status.get()); - std::array components; - ExtractPerDeviceValues(context, read.get(), &components, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - AssertScalarFloatEq(components[0].get(), 23.); - AssertScalarFloatEq(components[1].get(), 18.); - - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } -} - TEST(PARALLEL_DEVICE, TestBasicCPU) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -498,8 +142,8 @@ TEST(PARALLEL_DEVICE, TestExplicitCopies) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // The value of the original tensor is replicated on each device. - AssertScalarFloatEq(components[0].get(), 3.); - AssertScalarFloatEq(components[1].get(), 3.); + ExpectScalarEq(components[0].get(), 3.); + ExpectScalarEq(components[1].get(), 3.); // Verify that the mirrors are placed on the component devices. std::string first_device = @@ -630,7 +274,7 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { &second_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(second_components[1].get(), 9.); + ExpectScalarEq(second_components[1].get(), 9.); // Verify that the mirrors are placed on the component devices. std::string first_device = TFE_TensorHandleBackingDeviceName( @@ -644,8 +288,8 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { std::array first_components; ExtractPerDeviceValues(context.get(), second_components[0].get(), &first_components, status.get()); - AssertScalarFloatEq(first_components[0].get(), 3.); - AssertScalarFloatEq(first_components[1].get(), 6.); + ExpectScalarEq(first_components[0].get(), 3.); + ExpectScalarEq(first_components[1].get(), 6.); first_device = TFE_TensorHandleBackingDeviceName(first_components[0].get(), status.get()); @@ -806,8 +450,8 @@ TEST(PARALLEL_DEVICE, TestCollective) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 3.); - AssertScalarFloatEq(result_components[1].get(), 3.); + ExpectScalarEq(result_components[0].get(), 3.); + ExpectScalarEq(result_components[1].get(), 3.); } void RegisterCollectiveMulFunction(TFE_Context* context, @@ -909,8 +553,8 @@ TEST(PARALLEL_DEVICE, TestFunction) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 7. * 9.); - AssertScalarFloatEq(result_components[1].get(), 7. * 9.); + ExpectScalarEq(result_components[0].get(), 7. * 9.); + ExpectScalarEq(result_components[1].get(), 7. 
* 9.); std::string first_device = TFE_TensorHandleBackingDeviceName( result_components[0].get(), status.get()); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc new file mode 100644 index 00000000000..fba47865c36 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc @@ -0,0 +1,308 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/test.h" + +// NOTE(allenl): These tests currently go through TFE_Execute and so are +// integration testing rather than purely testing the parallel device. They +// correspond fairly well to the implementation, but testing the C++ directly is +// another option. + + +Variable* Variable::Create(TFE_Context* context, TF_DataType type, + const int64_t* dims, const int num_dims, + const char* device, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "VarHandleOp", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op.get(), "dtype", type); + TFE_OpSetAttrShape(op.get(), "shape", dims, num_dims, status); + TFE_OpSetAttrString(op.get(), "container", "", 0); + // Use the special GUID for no buffer sharing + // + // TODO(allenl): Should we provide a better API for this? AFAIK this is the + // only reasonable way to make variables with no aliasing using the eager C + // API. + std::string no_sharing = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; + TFE_OpSetAttrString(op.get(), "shared_name", no_sharing.c_str(), + no_sharing.length()); + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + TFE_Execute(op.get(), &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return new Variable(var_handle, type); +} + +void Variable::Destroy(TFE_Context* context, TF_Status* status) { + // Free the backing buffer for the variable. + std::unique_ptr op( + TFE_NewOp(context, "DestroyResourceOp", status), &TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpAddInput(op.get(), handle_, status); + if (TF_GetCode(status) != TF_OK) return; + const char* device = TFE_TensorHandleDeviceName(handle_, status); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return; + int num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return; + // Delete the variable handle itself. 
+  TFE_DeleteTensorHandle(handle_);
+}
+
+TensorHandlePtr Variable::Read(TFE_Context* context, TF_Status* status) {
+  std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+      TFE_NewOp(context, "ReadVariableOp", status), &TFE_DeleteOp);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpAddInput(op.get(), handle_, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  const char* device = TFE_TensorHandleDeviceName(handle_, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpSetDevice(op.get(), device, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpSetAttrType(op.get(), "dtype", type_);
+  int num_retvals = 1;
+  TFE_TensorHandle* var_value = nullptr;
+  TFE_Execute(op.get(), &var_value, &num_retvals, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  return TensorHandlePtr(var_value);
+}
+
+void Variable::GeneralAssignment(const char* op_name, TFE_Context* context,
+                                 TFE_TensorHandle* value, TF_Status* status) {
+  std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+      TFE_NewOp(context, op_name, status), &TFE_DeleteOp);
+  if (TF_GetCode(status) != TF_OK) return;
+  TFE_OpSetAttrType(op.get(), "dtype", type_);
+  TFE_OpAddInput(op.get(), handle_, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  TFE_OpAddInput(op.get(), value, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  const char* device = TFE_TensorHandleDeviceName(handle_, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  TFE_OpSetDevice(op.get(), device, status);
+
+  int num_retvals = 0;
+  TFE_Execute(op.get(), nullptr, &num_retvals, status);
+  if (TF_GetCode(status) != TF_OK) return;
+}
+
+void Variable::AssignAdd(TFE_Context* context, TFE_TensorHandle* value,
+                         TF_Status* status) {
+  GeneralAssignment("AssignAddVariableOp", context, value, status);
+}
+
+void Variable::Assign(TFE_Context* context, TFE_TensorHandle* value,
+                      TF_Status* status) {
+  GeneralAssignment("AssignVariableOp", context, value, status);
+}
+
+// Passed to `TF_NewTensor` to indicate how an array of floats should be
+// deleted.
+static void FloatDeallocator(void* data, size_t, void* arg) {
+  delete[] static_cast<float*>(data);
+}
+
+// Creates a TFE_TensorHandle with value `v`.
+TensorHandlePtr FloatTensorHandle(float v, TF_Status* status) {
+  const int num_bytes = sizeof(float);
+  float* values = new float[1];
+  values[0] = v;
+  std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)> tensor(
+      TF_NewTensor(TF_FLOAT, nullptr, 0, values, num_bytes, &FloatDeallocator,
+                   nullptr),
+      TF_DeleteTensor);
+  return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status));
+}
+
+// Creates a rank-one TFE_TensorHandle with value `v`.
+TensorHandlePtr VectorFloatTensorHandle(const std::vector<float>& v,
+                                        TF_Status* status) {
+  const int num_bytes = v.size() * sizeof(float);
+  float* values = new float[v.size()];
+  memcpy(values, v.data(), num_bytes);
+  int64_t dims = v.size();
+  std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)> tensor(
+      TF_NewTensor(TF_FLOAT, &dims, 1 /* num_dims */, values, num_bytes,
+                   &FloatDeallocator, nullptr),
+      TF_DeleteTensor);
+  return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status));
+}
+
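The helpers above are exercised end to end by `BasicTestsForTwoDevices` later in this file; as a minimal sketch of how `Variable` and `FloatTensorHandle` compose (a sketch only, not part of the patch: the CPU device string and the pre-built `context` and `status` pointers are assumptions):

  // Sketch: create, update, read, and destroy a scalar float variable.
  Variable* variable = Variable::Create(
      context, TF_FLOAT, /* Scalar */ {}, 0,
      "/job:localhost/replica:0/task:0/device:CPU:0", status);
  TensorHandlePtr start_value = FloatTensorHandle(20., status);
  variable->Assign(context, start_value.get(), status);
  TensorHandlePtr update = FloatTensorHandle(3., status);
  variable->AssignAdd(context, update.get(), status);
  TensorHandlePtr read = variable->Read(context, status);  // Now holds 23.
  variable->Destroy(context, status);  // Runs an op; must be called explicitly.
  delete variable;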
+// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle.
+template <std::size_t num_replicas>
+void ExtractPerDeviceValues(
+    TFE_Context* context, TFE_TensorHandle* input,
+    std::array<TensorHandlePtr, num_replicas>* components, TF_Status* status) {
+  std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+      TFE_NewOp(context, "TPUReplicatedOutput", status), TFE_DeleteOp);
+  if (TF_GetCode(status) != TF_OK) return;
+  TFE_OpSetAttrInt(op.get(), "num_replicas", num_replicas);
+  TFE_OpAddInput(op.get(), input, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  const char* device = TFE_TensorHandleDeviceName(input, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  TFE_OpSetDevice(op.get(), device, status);
+  if (TF_GetCode(status) != TF_OK) return;
+
+  TFE_TensorHandle* result_handles[num_replicas];
+  int num_retvals = num_replicas;
+  TFE_Execute(op.get(), result_handles, &num_retvals, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  for (int i = 0; i < num_replicas; ++i) {
+    (*components)[i].reset(result_handles[i]);
+  }
+}
+
+TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first,
+                         TFE_TensorHandle* second, TF_Status* status) {
+  std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+      TFE_NewOp(context, "Mul", status), TFE_DeleteOp);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpAddInput(op.get(), first, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpAddInput(op.get(), second, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  const char* first_device = TFE_TensorHandleDeviceName(first, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpSetDevice(op.get(), first_device, status);
+
+  TFE_TensorHandle* result_handle;
+  int num_retvals = 1;
+  TFE_Execute(op.get(), &result_handle, &num_retvals, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  return TensorHandlePtr(result_handle);
+}
+
+// Create and modify a variable placed on a parallel device which composes
+// `first_device` and `second_device`.
+void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device,
+                             const char* second_device) {
+  // Register the custom device
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0";
+  std::array<const char*, 2> underlying_devices{first_device, second_device};
+  RegisterParallelDevice(context, device_name, underlying_devices,
+                         status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+  // Create a variable handle (uninitialized to start) placed on the parallel
+  // device.
+  std::function<void(Variable*)> variable_deleter = [&](Variable* to_delete) {
+    to_delete->Destroy(context, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    delete to_delete;
+  };
+  std::unique_ptr<Variable, decltype(variable_deleter)> variable(
+      Variable::Create(context, TF_FLOAT, /* Scalar */ {}, 0, device_name,
+                       status.get()),
+      variable_deleter);
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+  // Assign an initial value to the variable, implicitly mirroring it to each
+  // component device.
+  {
+    TensorHandlePtr initial_value = FloatTensorHandle(20., status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    variable->Assign(context, initial_value.get(), status.get());
+  }
+
+  // Read from the variable and verify that we have a parallel tensor.
+  {
+    TensorHandlePtr read = variable->Read(context, status.get());
+    std::array<TensorHandlePtr, 2> components;
+    ExtractPerDeviceValues(context, read.get(), &components, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    ExpectScalarEq<float>(components[0].get(), 20.);
+    ExpectScalarEq<float>(components[1].get(), 20.);
+
+    std::string first_device =
+        TFE_TensorHandleBackingDeviceName(components[0].get(), status.get());
+    ASSERT_EQ(underlying_devices[0], first_device);
+    std::string second_device =
+        TFE_TensorHandleBackingDeviceName(components[1].get(), status.get());
+    ASSERT_EQ(underlying_devices[1], second_device);
+  }
+
+  // Add a parallel tensor with different values on each device to the
+  // variable.
+  {
+    TensorHandlePtr value_one(FloatTensorHandle(3., status.get()));
+    TensorHandlePtr value_two(FloatTensorHandle(-2., status.get()));
+    std::array<TFE_TensorHandle*, 2> components{value_one.get(),
+                                                value_two.get()};
+    TensorHandlePtr combined_value =
+        CreatePerDeviceValues(context, components, device_name, status.get());
+    variable->AssignAdd(context, combined_value.get(), status.get());
+  }
+
+  // Read the variable and verify that each component has the right modified
+  // value.
+  {
+    TensorHandlePtr read = variable->Read(context, status.get());
+    std::array<TensorHandlePtr, 2> components;
+    ExtractPerDeviceValues(context, read.get(), &components, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    ExpectScalarEq<float>(components[0].get(), 23.);
+    ExpectScalarEq<float>(components[1].get(), 18.);
+
+    std::string first_device =
+        TFE_TensorHandleBackingDeviceName(components[0].get(), status.get());
+    ASSERT_EQ(underlying_devices[0], first_device);
+    std::string second_device =
+        TFE_TensorHandleBackingDeviceName(components[1].get(), status.get());
+    ASSERT_EQ(underlying_devices[1], second_device);
+  }
+  // Compute the device ID twice and verify the result
+  for (int i = 0; i < 2; ++i) {
+    std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+        TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp);
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_OpSetDevice(op.get(), device_name, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_TensorHandle* result_handle;
+    int num_retvals = 1;
+    TFE_Execute(op.get(), &result_handle, &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    std::array<TensorHandlePtr, 2> components;
+    ExtractPerDeviceValues(context, result_handle, &components, status.get());
+    TFE_DeleteTensorHandle(result_handle);
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    ExpectScalarEq<int64_t>(components[0].get(), 0);
+    ExpectScalarEq<int64_t>(components[1].get(), 1);
+    std::string first_device =
+        TFE_TensorHandleBackingDeviceName(components[0].get(), status.get());
+    ASSERT_EQ(underlying_devices[0], first_device);
+    std::string second_device =
+        TFE_TensorHandleBackingDeviceName(components[1].get(), status.get());
+    ASSERT_EQ(underlying_devices[1], second_device);
+  }
+}
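With the shared test body factored out this way, a test target that links this library only needs to build an eager context and hand `BasicTestsForTwoDevices` two component device names. A minimal sketch of such a caller (not part of this patch; the device strings are assumptions, and a real test would also have to ensure two CPU devices are available):

TEST(PARALLEL_DEVICE_LIB, BasicTestsOnTwoCpus) {
  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
      TF_NewStatus(), TF_DeleteStatus);
  std::unique_ptr<TFE_ContextOptions, decltype(&TFE_DeleteContextOptions)> opts(
      TFE_NewContextOptions(), TFE_DeleteContextOptions);
  std::unique_ptr<TFE_Context, decltype(&TFE_DeleteContext)> context(
      TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext);
  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
  // Assumed component device names; the parallel device itself is registered
  // inside BasicTestsForTwoDevices.
  BasicTestsForTwoDevices(context.get(),
                          "/job:localhost/replica:0/task:0/device:CPU:0",
                          "/job:localhost/replica:0/task:0/device:CPU:1");
}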
diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.h b/tensorflow/c/eager/parallel_device/parallel_device_testlib.h
new file mode 100644
index 00000000000..fdd21087949
--- /dev/null
+++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.h
@@ -0,0 +1,174 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_
+#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_
+
+#include "tensorflow/c/eager/parallel_device/parallel_device.h"
+
+#include <array>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/core/platform/test.h"
+
+
+// Functor for making unique_ptr to TFE_TensorHandle slightly more
+// ergonomic. Using decltype(TFE_DeleteTensorHandle) in the unique_ptr's second
+// template argument requires passing a function pointer to
+// TFE_DeleteTensorHandle when constructing the unique_ptr.
+class TensorHandleDeleter {
+ public:
+  void operator()(TFE_TensorHandle* to_delete) {
+    TFE_DeleteTensorHandle(to_delete);
+  }
+};
+
+using TensorHandlePtr = std::unique_ptr<TFE_TensorHandle, TensorHandleDeleter>;
+
+// A helper for performing common operations on variables. A much more
+// restricted stand-in for tf.Variable in Python.
+class Variable {
+ public:
+  // Construct a Variable from a resource-dtype TFE_TensorHandle and an
+  // indication of the dtype of the variable's value.
+  //
+  // Note that creating this resource-dtype handle can fail, so `Create` is a
+  // separate static method which returns a status.
+  Variable(TFE_TensorHandle* handle, TF_DataType type)
+      : handle_(handle), type_(type) {}
+
+  // Helper for constructing a resource handle and wrapping it in a `Variable`
+  // object.
+  static Variable* Create(TFE_Context* context, TF_DataType type,
+                          const int64_t* dims, const int num_dims,
+                          const char* device, TF_Status* status);
+  // Dereferences the backing buffer for the variable. Note that since this can
+  // fail (it runs operations), it must be called explicitly and the resulting
+  // `status` checked.
+  void Destroy(TFE_Context* context, TF_Status* status);
+
+  // Reads from the variable.
+  TensorHandlePtr Read(TFE_Context* context, TF_Status* status);
+  // Assigns a new value to the variable.
+  void Assign(TFE_Context* context, TFE_TensorHandle* value,
+              TF_Status* status);
+  // Adds `value` to the existing value of the variable.
+  void AssignAdd(TFE_Context* context, TFE_TensorHandle* value,
+                 TF_Status* status);
+
+ private:
+  // Helper for running any single-argument assignment ops (Assign, AssignAdd,
+  // AssignSub, ...).
+  void GeneralAssignment(const char* op_name, TFE_Context* context,
+                         TFE_TensorHandle* value, TF_Status* status);
+
+  // A handle for the resource-dtype tensor pointing to the variable's
+  // buffer.
+  TFE_TensorHandle* handle_;
+  // The dtype of the variable's buffer (input dtype for assignments, output
+  // dtype of read operations).
+  TF_DataType type_;
+};
+
+// Creates a TFE_TensorHandle with value `v`.
+TensorHandlePtr FloatTensorHandle(float v, TF_Status* status);
+
+// Creates a rank-one TFE_TensorHandle with value `v`.
+TensorHandlePtr VectorFloatTensorHandle(const std::vector<float>& v,
+                                        TF_Status* status);
+
+// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle.
+template <std::size_t num_replicas>
+void ExtractPerDeviceValues(
+    TFE_Context* context, TFE_TensorHandle* input,
+    std::array<TensorHandlePtr, num_replicas>* components, TF_Status* status);
+
+// Helper to pack `num_replicas` TFE_TensorHandles into one parallel handle.
+template <std::size_t num_replicas>
+TensorHandlePtr CreatePerDeviceValues(
+    TFE_Context* context,
+    const std::array<TFE_TensorHandle*, num_replicas>& components,
+    const char* device, TF_Status* status);
+
+TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first,
+                         TFE_TensorHandle* second, TF_Status* status);
+
+// Expects that `handle` is a scalar equal to `expected_value`.
+template <typename value_type>
+void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value);
+
+template <std::size_t num_underlying_devices>
+void RegisterParallelDevice(
+    TFE_Context* context, const char* device_name,
+    const std::array<const char*, num_underlying_devices>& underlying_devices,
+    TF_Status* status);
+
+// Create and modify a variable placed on a parallel device which composes
+// `first_device` and `second_device`.
+void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device,
+                             const char* second_device);
+
+// Implementations of templated functions ******************************
+
+template <std::size_t num_replicas>
+TensorHandlePtr CreatePerDeviceValues(
+    TFE_Context* context,
+    const std::array<TFE_TensorHandle*, num_replicas>& components,
+    const char* device, TF_Status* status) {
+  std::unique_ptr<TFE_Op, decltype(&TFE_DeleteOp)> op(
+      TFE_NewOp(context, "TPUReplicatedInput", status), TFE_DeleteOp);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  TFE_OpSetAttrInt(op.get(), "N", num_replicas);
+  for (int i = 0; i < num_replicas; ++i) {
+    TFE_OpAddInput(op.get(), components[i], status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+  TFE_OpSetDevice(op.get(), device, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+
+  TFE_TensorHandle* result_handle;
+  int num_retvals = 1;
+  TFE_Execute(op.get(), &result_handle, &num_retvals, status);
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  return TensorHandlePtr(result_handle);
+}
+
+template <typename value_type>
+void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)> value_zero(
+      TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor);
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+  EXPECT_EQ(expected_value,
+            *static_cast<value_type*>(TF_TensorData(value_zero.get())));
+}
+
+template <std::size_t num_underlying_devices>
+void RegisterParallelDevice(
+    TFE_Context* context, const char* device_name,
+    const std::array<const char*, num_underlying_devices>& underlying_devices,
+    TF_Status* status) {
+  TFE_CustomDevice device;
+  void* device_info;
+  tensorflow::eager::AllocateParallelDevice(
+      device_name, underlying_devices.data(), underlying_devices.size(),
+      &device, &device_info);
+  TFE_RegisterCustomDevice(context, device, device_name, device_info, status);
+}
+
+#endif  // TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
new file mode 100644
index 00000000000..34142fec5f7
--- /dev/null
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
@@ -0,0 +1,30 @@
+# Experimental gcs filesystem plugin.
+load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object")
+
+package(
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Filesystem implementation for GCS environments
+tf_cc_shared_object(
+    name = "gcs_filesystem",
+    framework_so = [],
+    linkstatic = False,
+    per_os_targets = 1,
+    visibility = ["//visibility:public"],
+    deps = [":gcs_filesystem_impl"],
+)
+
+# The real implementation of the filesystem.
+cc_library( + name = "gcs_filesystem_impl", + srcs = ["gcs_filesystem.cc"], + copts = select({ + "//conditions:default": [], + "//tensorflow:windows": get_win_copts(), + }), + deps = [ + "//tensorflow/c:tf_status", + "//tensorflow/c/experimental/filesystem:filesystem_interface", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc new file mode 100644 index 00000000000..ea9f59f1af3 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status.h" + +// Implementation of a filesystem for GCS environments. +// This filesystem will support `gs://` URI schemes. + +static void* plugin_memory_allocate(size_t size) { return calloc(1, size); } +static void plugin_memory_free(void* ptr) { free(ptr); } + +// SECTION 1. Implementation for `TF_RandomAccessFile` +// ---------------------------------------------------------------------------- +namespace tf_random_access_file { + +// TODO(vnvo2409): Implement later + +} // namespace tf_random_access_file + +// SECTION 2. Implementation for `TF_WritableFile` +// ---------------------------------------------------------------------------- +namespace tf_writable_file { + +// TODO(vnvo2409): Implement later + +} // namespace tf_writable_file + +// SECTION 3. Implementation for `TF_ReadOnlyMemoryRegion` +// ---------------------------------------------------------------------------- +namespace tf_read_only_memory_region { + +// TODO(vnvo2409): Implement later + +} // namespace tf_read_only_memory_region + +// SECTION 4. 
Implementation for `TF_Filesystem`, the actual filesystem +// ---------------------------------------------------------------------------- +namespace tf_gcs_filesystem { + +// TODO(vnvo2409): Implement later + +} // namespace tf_gcs_filesystem + +static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, + const char* uri) { + TF_SetFilesystemVersionMetadata(ops); + ops->scheme = strdup(uri); +} + +void TF_InitPlugin(TF_FilesystemPluginInfo* info) { + info->plugin_memory_allocate = plugin_memory_allocate; + info->plugin_memory_free = plugin_memory_free; + info->num_schemes = 1; + info->ops = static_cast( + plugin_memory_allocate(info->num_schemes * sizeof(info->ops[0]))); + ProvideFilesystemSupportFor(&info->ops[0], "gs"); +} \ No newline at end of file diff --git a/tensorflow/c/experimental/network.cc b/tensorflow/c/experimental/network.cc index 94375cf9983..97e63ec6259 100644 --- a/tensorflow/c/experimental/network.cc +++ b/tensorflow/c/experimental/network.cc @@ -108,7 +108,7 @@ class CServerFactory : public ServerFactory { delete_function_(delete_function), rendezvous_builder_(rendezvous_builder) {} - Status NewServer(const ServerDef& server_def, + Status NewServer(const ServerDef& server_def, const Options& options, std::unique_ptr* out_server) override { TF_RETURN_IF_ERROR(CGrpcServer::Create( server_def, init_function_, start_function_, stop_function_, diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 7a694f4f803..2ded784882b 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -31,9 +31,6 @@ cc_library( "//tensorflow/c/experimental/saved_model/public:concrete_function.h", ], copts = tf_copts(), - # TODO(bmzhao): Remove this as we refactor C API to granular targets, - # so that we can depend on c/eager/c_api_unified_experimental.h. 
- features = ["-layering_check"], visibility = [ "//tensorflow/c/experimental/saved_model/public:__pkg__", ], @@ -41,6 +38,8 @@ cc_library( ":concrete_function_type", ":function_metadata", ":function_metadata_type", + ":tensorhandle_list", + ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_internal", @@ -156,10 +155,43 @@ cc_library( "saved_model_api_type.h", ], deps = [ + "//tensorflow/c:conversion_macros", "//tensorflow/c/experimental/saved_model/core:saved_model_api", ], ) +cc_library( + name = "tensorhandle_list", + srcs = [ + "tensorhandle_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:tensorhandle_list.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":tensorhandle_list_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + ], +) + +cc_library( + name = "tensorhandle_list_type", + hdrs = [ + "tensorhandle_list_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/eager:tensor_handle_interface", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 4884f9e2e97..dd54416ddf9 100644 --- a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -15,12 +15,12 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" extern "C" { @@ -29,10 +29,9 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -TF_OutputList* TF_ConcreteFunctionGetCaptures(TF_ConcreteFunction* func) { - // TODO(bmzhao): Refactor TF_OutputList struct definition into a separate - // internal header, and implement this function. 
- return nullptr; +const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( + TF_ConcreteFunction* func) { + return tensorflow::wrap(&tensorflow::unwrap(func)->GetCaptures()); } TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func) { diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc index 629610dbe29..9614e507646 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc @@ -41,7 +41,7 @@ TF_SavedModel* TF_LoadSavedModel(const char* dirname, TFE_Context* ctx, if (!status->status.ok()) { return nullptr; } - return new TF_SavedModel{std::move(result)}; + return tensorflow::wrap(result.release()); } TF_SavedModel* TF_LoadSavedModelWithTags(const char* dirname, TFE_Context* ctx, @@ -60,17 +60,19 @@ TF_SavedModel* TF_LoadSavedModelWithTags(const char* dirname, TFE_Context* ctx, if (!status->status.ok()) { return nullptr; } - return new TF_SavedModel{std::move(result)}; + return tensorflow::wrap(result.release()); } -void TF_DeleteSavedModel(TF_SavedModel* model) { delete model; } +void TF_DeleteSavedModel(TF_SavedModel* model) { + delete tensorflow::unwrap(model); +} TF_ConcreteFunction* TF_GetSavedModelConcreteFunction(TF_SavedModel* model, const char* function_path, TF_Status* status) { tensorflow::ConcreteFunction* result = nullptr; tensorflow::Status get_function_status = - model->saved_model->GetFunction(function_path, &result); + tensorflow::unwrap(model)->GetFunction(function_path, &result); status->status.Update(get_function_status); if (!get_function_status.ok()) { return nullptr; @@ -82,7 +84,8 @@ TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( TF_SavedModel* model, const char* signature_def_key, TF_Status* status) { tensorflow::ConcreteFunction* result = nullptr; tensorflow::Status get_function_status = - model->saved_model->GetSignatureDefFunction(signature_def_key, &result); + tensorflow::unwrap(model)->GetSignatureDefFunction(signature_def_key, + &result); status->status.Update(get_function_status); if (!get_function_status.ok()) { return nullptr; @@ -91,7 +94,8 @@ TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( } TF_ConcreteFunctionList* TF_ListSavedModelFunctions(TF_SavedModel* model) { - return new TF_ConcreteFunctionList{model->saved_model->ListFunctions()}; + return new TF_ConcreteFunctionList{ + tensorflow::unwrap(model)->ListFunctions()}; } } // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h b/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h index 9e2d1117463..380c3703426 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h @@ -18,13 +18,18 @@ limitations under the License. #include +#include "tensorflow/c/conversion_macros.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" // Internal structures used by the SavedModel C API. These are likely to change // and should not be depended on. 
-struct TF_SavedModel { - std::unique_ptr saved_model; -}; +typedef struct TF_SavedModel TF_SavedModel; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SavedModelAPI, TF_SavedModel) + +} // namespace tensorflow #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SAVED_MODEL_API_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc new file mode 100644 index 00000000000..7d018658101 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" + +#include + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" + +extern "C" { + +size_t TF_TensorHandleListSize(const TF_TensorHandleList* list) { + return tensorflow::unwrap(list)->size(); +} + +TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, + int i) { + return tensorflow::wrap((*tensorflow::unwrap(list))[i]); +} + + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h new file mode 100644 index 00000000000..8cbec2806a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" + +// Internal structures used by the SavedModel C API. These are likely to +// change and should not be depended on. 
+ +typedef struct TF_TensorHandleList TF_TensorHandleList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS( + std::vector, + TF_TensorHandleList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index af65e05e7f6..0cfa0a2c005 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,6 +24,7 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", + "tensorhandle_list.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -39,6 +40,7 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", + ":tensorhandle_list", ], ) @@ -61,3 +63,8 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) + +alias( + name = "tensorhandle_list", + actual = "//tensorflow/c/experimental/saved_model/internal:tensorhandle_list", +) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index 30f533f140a..aae95a5477c 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index 351d8daed8e..2a87214270c 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -17,9 +17,9 @@ limitations under the License. #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ #include "tensorflow/c/c_api_macros.h" -#include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" #ifdef __cplusplus extern "C" { @@ -36,7 +36,7 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( TF_ConcreteFunction* func); // Returns a list of TensorHandles implicitly captured by this function. -TF_CAPI_EXPORT extern TF_OutputList* TF_ConcreteFunctionGetCaptures( +TF_CAPI_EXPORT extern const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( TF_ConcreteFunction* func); // Returns a TFE_Op suitable for executing this function. 
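Since `TF_ConcreteFunctionGetCaptures` now returns the new `const TF_TensorHandleList*` rather than a `TF_OutputList*`, callers walk a function's captures through the list accessors added in this change. A small sketch of that pattern (not part of the patch; the `func` value is assumed to come from `TF_GetSavedModelConcreteFunction`, and the list is only read here since this header exposes no deleter for it):

// Sketch: enumerate the tensor handles captured by a loaded ConcreteFunction.
const TF_TensorHandleList* captures = TF_ConcreteFunctionGetCaptures(func);
for (int i = 0; i < static_cast<int>(TF_TensorHandleListSize(captures)); ++i) {
  TFE_TensorHandle* capture = TF_TensorHandleListGet(captures, i);
  // Each element is a TFE_TensorHandle owned elsewhere; inspect it with the
  // usual eager C API, e.g. TFE_TensorHandleDataType(capture).
}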
diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h index 7add847259c..e35546751f1 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h @@ -21,19 +21,27 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + // An opaque type that is acts like a list of TF_ConcreteFunction pointers. typedef struct TF_ConcreteFunctionList TF_ConcreteFunctionList; // Returns the size of `list`. -TF_CAPI_EXPORT size_t -TF_ConcreteFunctionListSize(TF_ConcreteFunctionList* list); +TF_CAPI_EXPORT extern size_t TF_ConcreteFunctionListSize( + TF_ConcreteFunctionList* list); // Returns the `i`th TF_ConcreteFunction in the list. -TF_CAPI_EXPORT TF_ConcreteFunction* TF_ConcreteFunctionListGet( +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_ConcreteFunctionListGet( TF_ConcreteFunctionList* list, int i); // Deletes `list`. -TF_CAPI_EXPORT void TF_DeleteConcreteFunctionList( +TF_CAPI_EXPORT extern void TF_DeleteConcreteFunctionList( TF_ConcreteFunctionList* list); +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h new file mode 100644 index 00000000000..a1e88db3474 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that is acts like a list of TF_ConcreteFunction pointers. +typedef struct TF_TensorHandleList TF_TensorHandleList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( + const TF_TensorHandleList* list); + +// Returns the `i`th TFE_TensorHandle in the list. 
+TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( + const TF_TensorHandleList* list, int i); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index e8cb40f153b..e1fad8e697a 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -178,7 +178,7 @@ cc_library_with_android_deps( name = "ops", srcs = ["framework/ops.cc"], hdrs = ["framework/ops.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], deps = [ "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -197,7 +197,7 @@ cc_library_with_android_deps( "framework/scope_internal.h", ], hdrs = ["framework/scope.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ], @@ -237,7 +237,7 @@ cc_library_with_android_deps( name = "client_session", srcs = ["client/client_session.cc"], hdrs = ["client/client_session.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ":scope", @@ -275,7 +275,7 @@ cc_library_with_android_deps( srcs = ["ops/const_op.cc"], hdrs = ["ops/const_op.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":ops", @@ -304,7 +304,7 @@ cc_library_with_android_deps( srcs = ["ops/while_loop.cc"], hdrs = ["ops/while_loop.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":cc_ops", diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD index 4249d7918c8..045d4e6cd97 100644 --- a/tensorflow/cc/experimental/base/public/BUILD +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -57,7 +57,22 @@ cc_library( "tensor.h", ], deps = [ + ":status", "//tensorflow/c:tf_datatype", "//tensorflow/c:tf_tensor", ], ) + +cc_library( + name = "tensorhandle", + hdrs = [ + "tensorhandle.h", + ], + deps = [ + ":runtime", + ":status", + ":tensor", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) diff --git a/tensorflow/cc/experimental/base/public/runtime.h b/tensorflow/cc/experimental/base/public/runtime.h index 47fd8869647..711a38c233a 100644 --- a/tensorflow/cc/experimental/base/public/runtime.h +++ b/tensorflow/cc/experimental/base/public/runtime.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" namespace tensorflow { +namespace experimental { namespace cc { // Runtime represents an opaque instance of a Tensorflow runtime, with its own @@ -40,6 +41,7 @@ class Runtime { private: friend class RuntimeBuilder; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TFE_Context. Takes ownership of ctx. 
explicit Runtime(TFE_Context* ctx) : ctx_(ctx) {} @@ -63,6 +65,7 @@ class Runtime { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ diff --git a/tensorflow/cc/experimental/base/public/runtime_builder.h b/tensorflow/cc/experimental/base/public/runtime_builder.h index ed3c93ae135..737e06cb2c6 100644 --- a/tensorflow/cc/experimental/base/public/runtime_builder.h +++ b/tensorflow/cc/experimental/base/public/runtime_builder.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // RuntimeBuilder is a builder used to construct a tensorflow::cc::Runtime. @@ -79,6 +80,7 @@ inline std::unique_ptr RuntimeBuilder::Build(Status* status) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ diff --git a/tensorflow/cc/experimental/base/public/status.h b/tensorflow/cc/experimental/base/public/status.h index f91f2caccd8..98c8cf6ced2 100644 --- a/tensorflow/cc/experimental/base/public/status.h +++ b/tensorflow/cc/experimental/base/public/status.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" namespace tensorflow { +namespace experimental { namespace cc { // Status is a wrapper around an error code and an optional error message. @@ -57,6 +58,7 @@ class Status { friend class RuntimeBuilder; friend class Runtime; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TF_Status*, and takes ownership of it. explicit Status(TF_Status* status) : status_(status) {} @@ -88,6 +90,7 @@ inline void Status::SetStatus(TF_Code code, const std::string& msg) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ diff --git a/tensorflow/cc/experimental/base/public/tensor.h b/tensorflow/cc/experimental/base/public/tensor.h index 1afdbcad50c..fc447262ce1 100644 --- a/tensorflow/cc/experimental/base/public/tensor.h +++ b/tensorflow/cc/experimental/base/public/tensor.h @@ -19,30 +19,53 @@ limitations under the License. #include #include +#include #include +#include #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_tensor.h" +#include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // Tensor represents an n-dimensional array of values. class Tensor { public: - // TODO(bmzhao): Add a factory function that constructs a Tensor from a char - // buffer, with an options struct (to specify the buffer's layout, device?, - // whether to create a TFRT or TF tensor, whether we should take ownership of - // the memory, etc). This requires extending TF_NewTensor with an options - // struct: - // https://github.com/tensorflow/tensorflow/blob/3c520614a3c056d56afdc79b59979b9b0087f8b9/tensorflow/c/tf_tensor.h#L77-L80 + using DeleterCallback = std::function; + + // Constructs a Tensor from user provided buffer. + // + // Params: + // dtype - The dtype of the tensor's data. + // shape - A shape vector, where each element corresponds to the size of + // the tensor's corresponding dimension. + // data - Pointer to a buffer of memory to construct a Tensor out of. + // len - The length (in bytes) of `data` + // deleter - A std::function to be called when the Tensor no longer needs the + // memory in `data`. 
This can be used to free `data`, or + // perhaps decrement a refcount associated with `data`, etc. + // status - Set to OK on success and an error on failure. + // Returns: + // If an error occurred, status->ok() will be false, and the returned + // Tensor must not be used. + // TODO(bmzhao): Add Runtime as an argument to this function so we can swap to + // a TFRT backed tensor. + // TODO(bmzhao): Add benchmarks on overhead for this function; we can + // consider using int64_t* + length rather than vector. + static Tensor FromBuffer(TF_DataType dtype, const std::vector& shape, + void* data, size_t len, DeleterCallback deleter, + Status* status); // TODO(bmzhao): In the case we construct a tensor from non-owned memory, // we should offer a way to deep copy the tensor into a new tensor, which // owns the underlying memory. This could be a .deepcopy()/clone() method. // TODO(bmzhao): In the future, we want to relax the non-copyability - // constraint. To do so, we can add a C API function that acts like CopyFrom: + // constraint. To do so, we can add a C API function that acts like + // CopyFrom: // https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/core/framework/tensor.h#L301-L311 // Tensor is movable, but not copyable @@ -85,6 +108,16 @@ class Tensor { // This object retains ownership of the pointer. TF_Tensor* GetTFTensor() const { return tensor_.get(); } + struct DeleterStruct { + std::function deleter; + }; + + static void DeleterFunction(void* memory, size_t len, void* deleter_struct) { + DeleterStruct* deleter = reinterpret_cast(deleter_struct); + deleter->deleter(memory, len); + delete deleter; + } + struct TFTensorDeleter { void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); } }; @@ -111,7 +144,32 @@ inline size_t Tensor::num_bytes() const { return TF_TensorByteSize(tensor_.get()); } +inline Tensor Tensor::FromBuffer(TF_DataType dtype, + const std::vector& shape, void* data, + size_t len, DeleterCallback deleter, + Status* status) { + // Credit to apassos@ for this technique: + // Despite the fact that our API takes a std::function deleter, we are able + // to maintain ABI stability because: + // 1. Only a function pointer is sent across the C API (&DeleterFunction) + // 2. DeleterFunction is defined in the same build artifact that constructed + // the std::function (so there isn't confusion about std::function ABI). + // Note that 2. is satisifed by the fact that this is a header-only API, where + // the function implementations are inline. + + DeleterStruct* deleter_struct = new DeleterStruct{deleter}; + TF_Tensor* tensor = TF_NewTensor(dtype, shape.data(), shape.size(), data, len, + &DeleterFunction, deleter_struct); + if (tensor == nullptr) { + status->SetStatus(TF_INVALID_ARGUMENT, + "Failed to create tensor for input buffer"); + return Tensor(nullptr); + } + return Tensor(tensor); +} + } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ diff --git a/tensorflow/cc/experimental/base/public/tensorhandle.h b/tensorflow/cc/experimental/base/public/tensorhandle.h new file mode 100644 index 00000000000..99453ee7ea8 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/tensorhandle.h @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ + +#include +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// An opaque representation of a tensor computed/managed by the Tensorflow +// runtime (tensorflow:cc::Runtime). Unlike a tensor, a Tensorhandle may refer +// to tensors placed in memory of different devices or remote address spaces. +// Note that tensorflow::cc::Runtime MUST outlive all TensorHandles created +// from it. +class TensorHandle { + public: + // Unwraps a Tensor from the given TensorHandle. If an error occurred, + // status->ok() will be false, and the returned Tensor must not be used. + Tensor Resolve(Status* status); + + // Constructs a TensorHandle from a Tensor. If an error occurred, + // status->ok() will be false, and the returned TensorHandle must not be used. + static TensorHandle FromTensor(const Tensor& tensor, const Runtime& runtime, + Status* status); + + // TensorHandle is movable, and not copyable + TensorHandle(TensorHandle&&) = default; + TensorHandle& operator=(TensorHandle&&) = default; + + private: + // Wraps a TFE_TensorHandle. Takes ownership of handle. + explicit TensorHandle(TFE_TensorHandle* handle) : handle_(handle) {} + + // TensorHandle is not copyable + TensorHandle(const TensorHandle&) = delete; + TensorHandle& operator=(const TensorHandle&) = delete; + + // Returns the underlying TFE_TensorHandle that this object wraps. + // This object retains ownership of the pointer. + TFE_TensorHandle* GetTFETensorHandle() const { return handle_.get(); } + + // Deletes the currently wrapped TFE_TensorHandle, and swaps it with handle, + // and takes ownership of handle. 
+ void Reset(TFE_TensorHandle* handle) { handle_.reset(handle); } + + struct TFETensorHandleDeleter { + void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); } + }; + std::unique_ptr handle_; +}; + +inline Tensor TensorHandle::Resolve(Status* status) { + TF_Tensor* tensor = + TFE_TensorHandleResolve(handle_.get(), status->GetTFStatus()); + if (!status->ok()) { + return Tensor(nullptr); + } + return Tensor(tensor); +} + +inline TensorHandle TensorHandle::FromTensor(const Tensor& tensor, + const Runtime& runtime, + Status* status) { + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandleFromTensor( + runtime.GetTFEContext(), tensor.GetTFTensor(), status->GetTFStatus()); + if (!status->ok()) { + return TensorHandle(nullptr); + } + return TensorHandle(tensor_handle); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD new file mode 100644 index 00000000000..f449d618f72 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -0,0 +1,50 @@ +# Tests for the C++ header-only base types. +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "tensor_types_test_util", + testonly = True, + hdrs = ["tensor_types_test_util.h"], + deps = [ + "//tensorflow/c:tf_datatype", + ], +) + +tf_cc_test( + name = "tensor_test", + srcs = [ + "tensor_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "tensorhandle_test", + srcs = [ + "tensorhandle_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:runtime_builder", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/cc/experimental/base/public:tensorhandle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc new file mode 100644 index 00000000000..33f9ab637e8 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -0,0 +1,163 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensor.h" + +#include +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace { + +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(ConstructScalarTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor tensor = Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. + std::vector shape; + shape.push_back(value.size()); + + Tensor tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { + Status status; + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. 
+ std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +TEST(CPPTensorAPI, ConstructTensorFromBuffer) { + bool done = false; + Status status; + std::vector data_vector({12, 14, 20, 18, 39, 42, 100}); + { + // data_vector is a rank 1 tensor. + std::vector shape; + shape.push_back(data_vector.size()); + + Tensor::DeleterCallback callback = [&done](void* data, size_t len) { + done = true; + }; + + Tensor tensor = + Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, + /*data=*/data_vector.data(), + /*len=*/data_vector.size() * sizeof(int32_t), + /*deleter=*/callback, &status); + ASSERT_TRUE(status.ok()) << status.message(); + } + // At this point, tensor has been destroyed, and the deleter callback should + // have run. + EXPECT_TRUE(done); +} + +} // namespace diff --git a/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h new file mode 100644 index 00000000000..af9cad7529b --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/tf_datatype.h" + +namespace tensorflow { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic +// tests via GoogleTest's TypedTests: +// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests +struct FloatType { + using type = float; + static constexpr TF_DataType kDType = TF_FLOAT; +}; + +struct DoubleType { + using type = double; + static constexpr TF_DataType kDType = TF_DOUBLE; +}; + +struct Int32Type { + using type = int32_t; + static constexpr TF_DataType kDType = TF_INT32; +}; + +struct UINT8Type { + using type = uint8_t; + static constexpr TF_DataType kDType = TF_UINT8; +}; + +struct INT8Type { + using type = int8_t; + static constexpr TF_DataType kDType = TF_INT8; +}; + +struct INT64Type { + using type = int64_t; + static constexpr TF_DataType kDType = TF_INT64; +}; + +struct UINT16Type { + using type = uint16_t; + static constexpr TF_DataType kDType = TF_UINT16; +}; + +struct UINT32Type { + using type = uint32_t; + static constexpr TF_DataType kDType = TF_UINT32; +}; + +struct UINT64Type { + using type = uint64_t; + static constexpr TF_DataType kDType = TF_UINT64; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ diff --git a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc new file mode 100644 index 00000000000..cfeaba4e392 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc @@ -0,0 +1,184 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensorhandle.h" + +#include +#include + +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/runtime_builder.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; +using tensorflow::experimental::cc::TensorHandle; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorHandleTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// then wraps it in a TensorHandle. 
We then unwrap it back into a Tensor, and +// verify the expected dims, dtype, value, num bytes, and num elements. +TYPED_TEST(ConstructScalarTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor original_tensor = + Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorHandleTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. + std::vector shape; + shape.push_back(value.size()); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorHandleTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. 
+ std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 882b4032f76..b13d8db48a9 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -4,7 +4,6 @@ load( "//tensorflow:tensorflow.bzl", "if_android", - "if_ios", "if_mobile", "if_not_mobile", "tf_cc_test", @@ -85,7 +84,7 @@ cc_library( "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ]), ) diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function.h b/tensorflow/cc/saved_model/experimental/public/concrete_function.h index f57ba052f1a..1adaf70b01a 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunction is an executable "function" loaded from a SavedModelAPI. @@ -54,6 +55,7 @@ inline const FunctionMetadata* ConcreteFunction::GetFunctionMetadata() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h index bab95278eac..88cb779ef15 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunctionList helps convert an opaque pointer to an array of @@ -56,6 +57,7 @@ inline std::vector ConcreteFunctionList::ToVector() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/function_metadata.h b/tensorflow/cc/saved_model/experimental/public/function_metadata.h index c3dcc45af0e..11e1a860d84 100644 --- a/tensorflow/cc/saved_model/experimental/public/function_metadata.h +++ b/tensorflow/cc/saved_model/experimental/public/function_metadata.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // FunctionMetadata stores additional function information, including @@ -40,6 +41,7 @@ class FunctionMetadata final { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h index 814479de213..04018bf2aab 100644 --- a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" namespace tensorflow { +namespace experimental { namespace cc { // SavedModelAPI offers a way to load Tensorflow Saved Models @@ -155,6 +156,7 @@ inline std::vector SavedModelAPI::ListFunctions() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc index 155c58604bf..7f7f6b09a6d 100644 --- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -26,10 +26,14 @@ limitations under the License. #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" -namespace tensorflow { namespace { +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::SavedModelAPI; +using tensorflow::experimental::cc::Status; + constexpr char kTestData[] = "cc/saved_model/testdata"; std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { @@ -43,21 +47,21 @@ std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { class CPPSavedModelAPITest : public ::testing::TestWithParam {}; TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. 
} builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); std::unordered_set tags = {"serve"}; - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status, &tags); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status, &tags); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. // That unblocks writing other tests that require a TF_SavedModel*, @@ -67,20 +71,20 @@ TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { } TEST_P(CPPSavedModelAPITest, LoadsSavedModel) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. } builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. // That unblocks writing other tests that require a TF_SavedModel*, @@ -94,4 +98,3 @@ INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticCPPSavedModelTests, } // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index c9a36b88795..e4df3090046 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -131,6 +131,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type)); std::vector dim_vars; string dim_sizes, indices; + int count = 1; if (shape.rank() == 0 || (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) { dim_sizes = "[1]"; @@ -140,6 +141,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, dim_vars.push_back(absl::StrCat("size_t dim", dim)); dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); indices += absl::StrCat("[dim", dim, "]"); + count *= shape.dimensions(dim); } } rewrites->push_back({"{{I}}", absl::StrCat(i)}); @@ -147,6 +149,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, rewrites->push_back({"{{DIM_VARS}}", absl::StrJoin(dim_vars, ", ")}); rewrites->push_back({"{{DIM_SIZES}}", dim_sizes}); rewrites->push_back({"{{INDICES}}", indices}); + rewrites->push_back({"{{COUNT}}", absl::StrCat(count)}); return Status::OK(); } @@ -199,6 +202,12 @@ Status GenArgMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int arg{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int arg{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.feed(i).name().empty()) { @@ -246,6 +255,12 @@ Status GenResultMethods(const tf2xla::Config& config, return (*static_cast( result_data({{I}}))){{INDICES}}; } + int result{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int result{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.fetch(i).name().empty()) { @@ -281,6 +296,12 @@ 
Status GenVariableMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int var_{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int var_{{NAME}}_count() const { + return {{COUNT}}; + } )"; const tf2xla::Variable& var = config.variable(i - config.feed_size()); rewrites.emplace_back("{{MAYBE_CONST}}", var.readonly() ? "const " : ""); diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index af58ca233f0..d011279dbb7 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -138,6 +138,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg0_size() const { + return 2 * sizeof(float); + } + int arg0_count() const { + return 2; + } void set_arg_myfeed_data(const void* data) { set_arg_data(0, data); @@ -156,6 +162,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg_myfeed_size() const { + return 2 * sizeof(float); + } + int arg_myfeed_count() const { + return 2; + } void set_arg1_data(const void* data) { set_arg_data(1, data); @@ -174,6 +186,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(1)))[dim0][dim1]; } + int arg1_size() const { + return 12 * sizeof(tensorflow::int64); + } + int arg1_count() const { + return 12; + } // Result methods for managing output buffers. Buffers are in row-major order. // Must only be called after a successful Run call. There is a set of methods @@ -204,6 +222,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result0_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result0_count() const { + return 30; + } tensorflow::uint32* result_myfetch_data() { return static_cast(result_data(0)); @@ -219,6 +243,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result_myfetch_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result_myfetch_count() const { + return 30; + } // Methods for managing variable buffers. Buffers are in row-major order. // @@ -261,6 +291,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(2)))[0]; } + int var_myvar_readonly_size() const { + return 1 * sizeof(float); + } + int var_myvar_readonly_count() const { + return 1; + } void set_var_myvar_data(float* data) { set_arg_data(3, data); @@ -279,6 +315,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(3)))[0]; } + int var_myvar_size() const { + return 1 * sizeof(float); + } + int var_myvar_count() const { + return 1; + } void set_var_myvar2_data(tensorflow::int32* data) { set_arg_data(4, data); @@ -297,6 +339,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(4)))[dim0]; } + int var_myvar2_size() const { + return 5 * sizeof(tensorflow::int32); + } + int var_myvar2_count() const { + return 5; + } private: // Number of buffers for the compiled computation. 
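Editor's note: a minimal sketch of how a caller might use the new argN_size()/argN_count() and resultN_size()/resultN_count() accessors that codegen.cc now emits (shown in the golden header above). "CompiledGraph" is a placeholder for whatever class tf_library generates via cpp_class; Run(), arg0_data() and result0_data() are the usual tfcompile-generated methods and are assumed here, while the *_size()/*_count() accessors are the ones added in this change.

// Copies an input into the generated buffer and reads back the first result,
// sizing both transfers from the generated constants instead of hard-coding
// shapes.
#include <cassert>
#include <cstring>
#include <vector>

template <typename CompiledGraph>
std::vector<float> RunOnce(CompiledGraph& graph,
                           const std::vector<float>& input) {
  // arg0_size()/arg0_count() come from the {{COUNT}} rewrites added above.
  assert(input.size() * sizeof(float) ==
         static_cast<size_t>(graph.arg0_size()));
  std::memcpy(graph.arg0_data(), input.data(), graph.arg0_size());
  graph.Run();
  const float* out = static_cast<const float*>(graph.result0_data());
  return std::vector<float>(out, out + graph.result0_count());
}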
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index abccefbcdbb..f2b28e70ff1 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -20,7 +20,7 @@ load( "tf_cc_test", "tf_copts", ) -load("//tensorflow:tensorflow.bzl", "tfcompile_extra_flags") +load("//tensorflow:tensorflow.bzl", "tfcompile_target_cpu") def tf_library( name, @@ -42,7 +42,8 @@ def tf_library( mlir_components = "None", deps = None, tags = []): - """Runs tfcompile to compile a TensorFlow graph into executable code. + """Runs tfcompile to compile a TensorFlow graph into executable code with fast + math enabled on cpu. Given an invocation of tf_library(name="foo", ...), generates the following build targets: @@ -187,7 +188,9 @@ def tf_library( # `find` on such an object. need_xla_data_proto = flags and flags.find("--gen_program_shape") != -1 - flags = tfcompile_extra_flags() + flags + target_cpu = tfcompile_target_cpu() + extra_flags = "--target_cpu=" + target_cpu + " " if target_cpu else " " + flags = extra_flags + flags if enable_xla_hlo_profiling: profiling_flag = "--xla_hlo_profile" @@ -207,6 +210,15 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" + default_fast_math_xla_flags = ("XLA_FLAGS='" + + "--xla_cpu_enable_fast_math=true " + + "--xla_cpu_fast_math_honor_nans=false " + + "--xla_cpu_fast_math_honor_infs=false " + + "--xla_cpu_fast_math_honor_functions=false " + + "--xla_cpu_fast_math_honor_division=false " + + "--xla_cpu_enable_fast_min_max=true " + + "$${XLA_FLAGS:-}' ") + native.genrule( name = ("gen_" + name), srcs = srcs, @@ -216,6 +228,7 @@ def tf_library( function_object_file, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + @@ -256,6 +269,7 @@ def tf_library( session_module_pb, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index f0cf8f2ded9..846947454bb 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -67,6 +67,8 @@ int main(int argc, char** argv) { flags.entry_point = "entry"; flags.debug_info_path_begin_marker = ""; + // Note that tfcompile.bzl's tf_library macro sets fast math flags as that is + // generally the preferred case. 
std::vector flag_list; AppendMainFlags(&flag_list, &flags); xla::AppendDebugOptionsFlags(&flag_list); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 28d922f9e3c..5ec0575ed77 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -251,7 +251,7 @@ cc_library( visibility = [":friends"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:graph", @@ -505,6 +505,7 @@ cc_library( name = "shape_inference", srcs = ["shape_inference.cc"], hdrs = ["shape_inference.h"], + visibility = [":friends"], deps = [ ":shape_inference_helpers", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 174250f18bd..9f5723f4fa4 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -2034,6 +2034,7 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "TensorArraySplitV3", "TensorArrayV3", "TensorArrayWriteV3", + "TensorListConcatV2", "TensorListElementShape", "TensorListFromTensor", "TensorListGather", @@ -2043,6 +2044,7 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "TensorListPushBack", "TensorListReserve", "TensorListSetItem", + "TensorListSplit", "TensorListStack", "TensorScatterAdd", "TensorScatterSub", diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index abb42aa1815..7842513331d 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -395,12 +395,11 @@ static void ShowXlaDeviceDeprecationWarning( if (absl::StrContains(compilation_device_name, "CPU") || absl::StrContains(compilation_device_name, "GPU")) { absl::call_once(once, [] { - LOG(WARNING) - << "XLA_GPU and XLA_CPU devices are deprecated and will be " - "removed in subsequent releases. Instead, use either " - "@tf.function(experimental_compile=True) for must-compile " - "semantics, or run with TF_XLA_FLAGS=--tf_xla_auto_jit=2 " - "for auto-clustering best-effort compilation."; + LOG(INFO) << "XLA_GPU and XLA_CPU devices are deprecated and will be " + "removed in subsequent releases. 
Instead, use either " + "@tf.function(experimental_compile=True) for must-compile " + "semantics, or run with TF_XLA_FLAGS=--tf_xla_auto_jit=2 " + "for auto-clustering best-effort compilation."; }); } } diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 34ff0c55615..17e4226405a 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -180,12 +180,10 @@ class XlaAssignVariableOp : public OpKernel { data::MakeIteratorOp); \ REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("AnonymousIteratorV2").Device(DEVICE).HostMemory("deleter"), \ - data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("DeleteIterator").Device(DEVICE).HostMemory("deleter"), \ - data::DeleteIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV2").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE), \ + data::DeleteIteratorOp); \ REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ data::IteratorGetNextOp); \ REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE), \ diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc index 99046c0bd76..3cc68f2a1a4 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.cc @@ -91,7 +91,7 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, } string message = absl::StrCat( "Function invoked by the following node is not compilable: ", - SummarizeNodeDef(node_def), ".\n"); + SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); absl::StrAppend(&message, "Uncompilable nodes:"); for (const auto& node_info : uncompilable_node_info) { string node_message = diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index e0ec990462b..8c24f182f5c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -201,9 +201,7 @@ void XlaComputationLaunchContext::PopulateInputs( se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Build ShapedBuffers that point directly to the Tensor buffers. - arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); - arg_buffers_.resize(kernel->xla_input_shapes.size()); - arg_ptrs_ = std::vector(arg_buffers_.size()); + arg_ptrs_ = std::vector(kernel->xla_input_shapes.size()); // Pass remaining parameters. 
const Tensor* t; @@ -239,11 +237,11 @@ void XlaComputationLaunchContext::PopulateInputs( << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = absl::make_unique( + arg_buffers_.emplace_back( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); - arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); - arg_ptrs_[i] = arg_buffers_[i].get(); + arg_buffers_.back().set_buffer(dmem, /*index=*/{}); + arg_ptrs_[i] = &arg_buffers_.back(); } } } diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 511e0f1451a..cf68dcb7dd6 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -165,7 +165,7 @@ class XlaComputationLaunchContext { se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; - std::vector> arg_buffers_; + std::deque arg_buffers_; std::vector arg_ptrs_; }; diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 4a4d566f163..c4472e1185c 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -77,10 +77,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tfjs:tensorflow_js_passes", - "//tensorflow/compiler/mlir/tfrt:lower_tf_to_tfd_alwayslink", - "//tensorflow/compiler/mlir/tfrt:runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tfrt:tf_legalize_to_tfrt", - "//tensorflow/compiler/mlir/tfrt:tf_to_corert", ], ) @@ -108,6 +104,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], alwayslink = 1, @@ -152,7 +149,6 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow:translate_registration", "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op", - "//tensorflow/compiler/mlir/tfrt:compatibility_analysis", "//tensorflow/compiler/mlir/xla:xla_mlir_translate", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index d907d28b2c7..c60c0c0edbf 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -31,7 +31,7 @@ filegroup( "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -216,13 +216,13 @@ cc_library( "ir/tfl_ops.h", "transforms/passes.h", "utils/attribute_utils.h", - "//tensorflow/compiler/mlir/lite/quantization:quantization_traits.h", "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h", ], deps = [ ":tensorflow_lite_ops_inc_gen", ":validators", "//tensorflow/compiler/mlir/lite/experimental/estimators:cost_estimators", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/lite/schema:schema_fbs", "@llvm-project//llvm:support", @@ -260,6 +260,25 @@ cc_library( ], ) +cc_library( + name = "tftext_utils", + srcs = [ + 
"utils/tftext_utils.cc", + ], + hdrs = [ + "utils/tftext_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "stateful_ops_utils", srcs = [ @@ -320,6 +339,7 @@ cc_library( ":lstm_utils", ":stateful_ops_utils", ":tensorflow_lite", + ":tftext_utils", ":validators", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", @@ -695,9 +715,9 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", "@llvm-project//mlir:MlirTranslateMain", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index e9192388070..df84b028f63 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -799,11 +799,6 @@ Optional Translator::CreateFlexOpCustomOptions( Optional Translator::CreateCustomOpCustomOptions( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc) { - std::string node_def_str; - if (!node_def.SerializeToString(&node_def_str)) { - return emitError(loc, "failed to serialize tensorflow node_def"), - llvm::None; - } auto flex_builder = CreateFlexBuilderWithNodeAttrs(node_def, loc); return builder_.CreateVector(flex_builder->GetBuffer()); } @@ -813,9 +808,13 @@ Translator::CreateFlexBuilderWithNodeAttrs( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc) { auto flex_builder = absl::make_unique(); size_t map_start = flex_builder->StartMap(); - for (const auto& pair : node_def.attr()) { + using Item = std::pair; + std::vector attrs(node_def.attr().begin(), node_def.attr().end()); + std::sort(attrs.begin(), attrs.end(), + [](Item& p1, Item& p2) -> bool { return p1.first < p2.first; }); + for (const Item& pair : attrs) { const char* key = pair.first.c_str(); - const auto& attr = pair.second; + const ::tensorflow::AttrValue& attr = pair.second; switch (attr.value_case()) { case ::tensorflow::AttrValue::kS: flex_builder->String(key, attr.s()); @@ -1020,7 +1019,7 @@ Optional> Translator::BuildOperator( if (!inst->getMutableAttrDict().getAttrs().empty()) { os << " {"; bool first = true; - for (auto& named_attr : inst->getMutableAttrDict().getDictionary()) { + for (auto& named_attr : inst->getAttrDictionary()) { os << (!first ? 
", " : ""); first = false; named_attr.first.print(os); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 3dcfe71770b..0c384ebf9f3 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -1966,9 +1966,9 @@ OpFoldResult TransposeOp::fold(ArrayRef operands) { } static LogicalResult Verify(TransposeOp op) { - auto input_type = op.x().getType().cast(); + auto input_type = op.input().getType().cast(); auto perm_type = op.perm().getType().cast(); - auto output_type = op.y().getType().cast(); + auto output_type = op.output().getType().cast(); if (input_type.hasStaticShape() && perm_type.hasStaticShape()) { if (perm_type.getNumElements() != input_type.getRank()) { return op.emitOpError( diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 0e6a3db1f1b..c7a1504c3b7 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -27,7 +27,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index d103c07b986..48bc68e5c95 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -20,7 +20,7 @@ limitations under the License. 
include "mlir/IR/OpBase.td" include "mlir/Interfaces/LoopLikeInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/quantization/quantization.td" @@ -161,7 +161,6 @@ class TFL_VariadicTensorOf allowedRuntimeTypes, Variadic>, TFL_RuntimeType>>; -def TFL_Uint8 : UI<8>; def TFL_Int32Or64 : SignlessIntOfWidths<[32, 64]>; def TFL_BoolTensor : TFL_TensorOf<[I1]>; @@ -228,10 +227,14 @@ class TFL_OperandHasAtleastRank : class TFL_OperandRankEquals1DimOfOperand : PredOpTrait<"operand " # x # "'s rank equals operand " # y # "'s size", - CPred<"$_op.getOperand(" # x # - ").getType().cast().getRank() == " - "$_op.getOperand(" # y # - ").getType().cast().getShape()[0]">>; + Or<[TFL_OperandIsUnrankedPred, + TFL_OperandIsUnrankedPred, + CPred<"!$_op.getOperand(" # y # + ").getType().cast().hasStaticShape()">, + CPred<"$_op.getOperand(" # x # + ").getType().cast().getRank() == " + "$_op.getOperand(" # y # + ").getType().cast().getShape()[0]">]>>; class TFL_Operand0DOr1ElementTensor : PredOpTrait<"operand #" # x # " is an 0-d tensor or 1-d tensor w/ 1 element", @@ -247,7 +250,22 @@ class TFL_TFTypesWithSameBits : Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; -class TFL_OperandIsNoneOrHasRankLessThanOrEqualTo : +class TFL_TFOperandTypesWithSameBits : + And<[ + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isUnsignedInteger(" # num # ")">]>, + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; + +class TFL_OperandIsNoneOrHasRank : + PredOpTrait<"operand " # n # " is " # m # "-D", + Or<[ + CPred<"$_op.getOperand(" # n # ").getType().isa()">, + TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # + ").getType().cast().getRank() == " # m>]>>; + +class TFL_OperandIsNoneOrHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[ CPred<"$_op.getOperand(" # n # ").getType().isa()">, @@ -255,13 +273,13 @@ class TFL_OperandIsNoneOrHasRankLessThanOrEqualTo : CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() <= " # m>]>>; -class TFL_OperandHasRankLessThanOrEqualTo : +class TFL_OperandHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() <= " # m>]>>; -class TFL_OperandHasRankGreaterThanOrEqualTo : +class TFL_OperandHasRankAtLeast : PredOpTrait<"operand " # n # " is at least " # m # "-D", Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # @@ -275,17 +293,33 @@ class TFL_OperandHasRankRange : "getRank() <= " # y>]>>; def TFL_FloatNonNegative : AttrConstraint< - CPred<"!$_self.cast().getValue().isNegative()">, + CPred<"$_self.isa() && " + "!$_self.cast().getValue().isNegative()">, "whose value is non-negative">; -def TFL_BoolTrue: AttrConstraint< - CPred<"$_self.cast().getValue()">, +def TFL_BoolTrue : AttrConstraint< + CPred<"$_self.isa() && $_self.cast().getValue()">, "whose value is true">; -def TFL_BoolFalse: AttrConstraint< - CPred<"!$_self.cast().getValue()">, +def TFL_BoolFalse : AttrConstraint< + CPred<"$_self.isa() && !$_self.cast().getValue()">, "whose value is false">; +class TFL_StringEqualsTo : 
AttrConstraint< + CPred<"$_self.cast().getValue() == \"" # value # "\"">, + "whose value equals to '" # value # "'">; + +// Ensures the array attribute's size is within the given maximum size. +class TFL_ArrayMaxCount : AttrConstraint< + CPred<"$_self.isa() && $_self.cast().size() <= " # n>, + "whose size is at most " # n>; + +// Ensures the given integer attribute has the given value. +class TFL_IntEqualsTo : AttrConstraint< + CPred<"$_self.isa() && " + "$_self.cast().getInt() == " # n>, + "whose value is " # n>; + // This is a quantization-aware version of TCresVTEtIsSameAsOp class TFL_TCresVTEtIsSameAsOp : And<[ TCOpResIsShapedTypePred, @@ -300,6 +334,18 @@ class TFL_TCresVTEtIsSameAsOp : And<[ "quant::QuantizedType::castToStorageType(" "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>]>; +// This is a quantization-aware version of TCresVTEtIsSameAsOp +class TFL_TCopVTEtAreSameAt : Or<[ + TCopVTEtAreSameAt<[i, j]>, + TFL_TFOperandTypesWithSameBits, + And<[ + SubstLeaves<"$_self", "getElementTypeOrSelf($_op.getOperand(" # j # "))", + quant_QuantizedType.predicate>, + CPred<"quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # i # "))) == " + "quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>; + //===----------------------------------------------------------------------===// // TFL op common constraints. //===----------------------------------------------------------------------===// @@ -395,9 +441,9 @@ class TFL_ConvOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$input, TFL_TensorOf<[F32, QI8, QUI8]>:$filter, - TFL_TensorOfOrNone<[F32, I32]>:$bias, + TFL_TensorOfOrNone<[F32, I32, I64]>:$bias, I32Attr:$dilation_h_factor, I32Attr:$dilation_w_factor, TFL_AFAttr:$fused_activation_function, @@ -406,7 +452,7 @@ class TFL_ConvOp : I32Attr:$stride_w ); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$output); let hasOptions = 0b1; } @@ -437,7 +483,10 @@ an output element, this operation computes \\(y = |x|\\). def TFL_AddOp : TFL_Op<"add", [ TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, - ResultsBroadcastableShape, NoSideEffect, Commutative, TFL_GpuTargetOp]> { + ResultsBroadcastableShape, + NoSideEffect, + Commutative, + TFL_GpuTargetOp]> { let summary = "Addition operator"; let description = [{ @@ -505,8 +554,14 @@ retained with length 1. 
let customOption = "ReducerOptions"; } -def TFL_TransposeConvOp: - TFL_Op<"transpose_conv", [NoSideEffect, TFL_GpuTargetOp]> { +def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ + NoSideEffect, + TFL_OperandHasRank<0, 1>, + TFL_OperandHasRank<1, 4>, + TFL_OperandHasRank<2, 4>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 2>>, + TFL_GpuTargetOp]> { let summary = "Transpose convolution operator"; let description = [{ @@ -514,16 +569,16 @@ def TFL_TransposeConvOp: }]; let arguments = (ins - TFL_1DTensorOf<[I32]>:$output_shape, - TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$weights, - TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$input, - TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, + TFL_I32Tensor:$output_shape, + TFL_TensorOf<[F32, QI8, QUI8]>:$weights, + TFL_TensorOf<[F32, QI8, QUI8]>:$input, + TFL_TensorOfOrNone<[F32, QI32]>:$bias, TFL_PaddingAttr:$padding, - I32Attr:$stride_h, - I32Attr:$stride_w + Confined:$stride_h, + Confined:$stride_w ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output); let hasOptions = 1; @@ -565,7 +620,7 @@ def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, UI8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -595,7 +650,7 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, UI8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -642,14 +697,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins TFL_VariadicTensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TFL_TensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$output ); let hasOptions = 1; @@ -713,7 +768,8 @@ def SparsityParameterAttr : StructAttr<"SparsityParameterAttr", TFL_Dialect, [ let storageType = [{ TFL::SparsityParameterAttr }]; } -def TFL_SparseConstOp : Op { let summary = "Sparse constant pseudo op."; @@ -924,12 +980,12 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8, TFL_Str]>:$params, + TFL_TensorOf<[F32, I8, I64, I32, UI8, TFL_Str]>:$params, TFL_I32OrI64Tensor:$indices ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8, TFL_Str]>:$output + TFL_TensorOf<[F32, I8, I64, I32, UI8, TFL_Str]>:$output ); } @@ -948,12 +1004,12 @@ def TFL_ScatterNdOp : TFL_Op<"scatter_nd", [ let arguments = (ins TFL_TensorOf<[I32]>:$indices, - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$updates, + TFL_TensorOf<[F32, I8, I64, I32, UI8]>:$updates, TFL_1DTensorOf<[I32]>:$shape ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I64, I32, UI8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -963,7 +1019,11 @@ def TFL_ScatterNdOp : TFL_Op<"scatter_nd", [ // Same type check of lhs and rhs is handled by the ResultsBroadcastableShape trait. 
def TFL_LessEqualOp : TFL_Op<"less_equal", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less_equal operator"; let description = [{ @@ -971,8 +1031,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -985,9 +1045,12 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ let hasOptions = 0; } -def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", - [NoSideEffect]> { - let summary = "Local Response Normalization."; +def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", [ + TFL_OperandHasRank<0, 4>, + SameOperandsAndResultShape, + SameOperandsAndResultType, + NoSideEffect]> { + let summary = "Local Response Normalization."; let description = [{ The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last @@ -1004,7 +1067,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TFL_TensorOf<[F32, QI8, QUI8]>:$input, + TFL_FpTensor:$input, I32Attr:$radius, F32Attr:$bias, F32Attr:$alpha, @@ -1012,7 +1075,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TFL_TensorOf<[F32, QI8, QUI8]>:$output + TFL_FpTensor:$output ); let hasOptions = 1; @@ -1048,7 +1111,7 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ NoSideEffect, TFL_OperandHasAtleastRank<0, 1>, PredOpTrait<"operand and result must have the same element type", - TCresVTEtIsSameAsOp<0, 0>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = [{ Returns a tensor with the provided diagonal and everything else padded with zeros. }]; @@ -1061,17 +1124,21 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$diagonal + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QUI8, QI8, TFL_Quint8]>:$diagonal ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QUI8, QI8, TFL_Quint8]>:$output ); let hasOptions = 0; } -def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [NoSideEffect]> { +def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [ + TFL_OperandHasAtleastRank<0, 2>, + PredOpTrait<"input and result must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. }]; @@ -1083,12 +1150,12 @@ innermost matrices. These will be overwritten by the values in `diagonal`. 
}]; let arguments = (ins - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$input, - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$diagonal + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$input, + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$diagonal ); let results = (outs - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$output + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$result ); let hasOptions = 0; @@ -1206,7 +1273,12 @@ larger than 0. } def TFL_NotEqualOp : TFL_Op<"not_equal", [ - ResultsBroadcastableShape, Commutative, NoSideEffect, NoQuantizableResult]> { + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + BinaryOpSameElementTypeConstraint, + ResultsBroadcastableShape, + Commutative, + NoSideEffect, + NoQuantizableResult]> { let summary = "Not_equal operator"; let description = [{ @@ -1214,8 +1286,8 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$lhs, + TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -1234,8 +1306,10 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ } def TFL_DivOp : TFL_Op<"div", [ - // TODO(fengliuai): NoQuantizableResult is only correct for int8 - // quantization. update to handle Uint8 quantization. + // TODO(fengliuai): NoQuantizableResult is only correct for int8 + // quantization. update to handle Uint8 quantization. + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult, @@ -1248,10 +1322,10 @@ def TFL_DivOp : TFL_Op<"div", [ let arguments = ( ins TFL_TensorOf<[F32, I32, QUI8]>:$lhs, - TFL_TensorOf<[F32, I32, TFL_Uint8]>:$rhs, + TFL_TensorOf<[F32, I32, QUI8]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs TFL_TensorOf<[F32, I32, TFL_Uint8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, QUI8]>:$output); let builders = [TFL_FusedBroadcastableBinaryBuilder]; @@ -1284,7 +1358,7 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", PredOpTrait<"value and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 1>>, TFL_OperandHasRank<0, 1>, - TFL_OperandHasRankGreaterThanOrEqualTo<1, 2> + TFL_OperandHasRankAtLeast<1, 2> ]> { let summary = "Embedding lookup operator"; @@ -1294,10 +1368,10 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", let arguments = (ins TFL_TensorOf<[I32]>:$lookup, - TFL_TensorOf<[F32, I8, TFL_Uint8]>:$value + TFL_TensorOf<[F32, I8, UI8]>:$value ); - let results = (outs TFL_TensorOf<[F32, I8, TFL_Uint8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8, UI8]>:$output); } def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, @@ -1313,8 +1387,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, let arguments = ( ins - TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$x, - TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$y + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, UI8, TFL_Str]>:$x, + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, UI8, TFL_Str]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ -1394,7 +1468,7 @@ def TFL_SqueezeOp: TFL_Op<"squeeze", [NoSideEffect, Given a tensor `input`, this operation returns a tensor of the same type with all 
dimensions of size 1 removed. If you don't want to remove all size 1 dimensions, you can remove specific size 1 dimensions by specifying -`axis`. +`squeeze_dims`. For example: @@ -1413,7 +1487,7 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] let arguments = (ins AnyTensor:$input, - DefaultValuedAttr:$squeeze_dims + Confined, [TFL_ArrayMaxCount<8>]>:$squeeze_dims ); let results = (outs @@ -1502,7 +1576,11 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [ } def TFL_GreaterOp : TFL_Op<"greater", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Greater operator"; let description = [{ @@ -1510,10 +1588,10 @@ def TFL_GreaterOp : TFL_Op<"greater", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_BoolTensor:$output); let builders = [TFL_ComparisonBinaryBuilder]; @@ -1522,9 +1600,12 @@ def TFL_GreaterOp : TFL_Op<"greater", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultShape, - TFL_GpuTargetOp]> { +def TFL_HardSwishOp: TFL_Op<"hard_swish", [ + NoSideEffect, + SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_GpuTargetOp]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1534,7 +1615,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$input); - let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$out); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$output); let hasOptions = 0; } @@ -1563,29 +1644,35 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, let customOption = "L2NormOptions"; } -def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [ + SameOperandsAndResultShape, + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Leaky Relu operator"; - // TODO(jpienaar): Add type restriction. This op is only defined for - // restricted (floating point) types. let description = [{ Element-wise Leaky ReLU operator x -> x >= 0 ? x : (alpha * x) }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input, // Slope of the activation function at x < 0. 
F32Attr:$alpha ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 0b1; } def TFL_LessOp : TFL_Op<"less", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less operator"; let description = [{ @@ -1593,8 +1680,8 @@ def TFL_LessOp : TFL_Op<"less", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -1655,11 +1742,14 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { def TFL_LogisticOp: TFL_Op<"logistic", [ NoSideEffect, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultShape, // zero_point = 0 // scale = 1. / (max_value + 1) FixedResultScale>, FixedResultScale>, + FixedOutputRangeInterface, TFL_GpuTargetOp]> { let summary = "Logistic operator"; @@ -1667,9 +1757,39 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ Computes element-wise Sigmoid of input }]; - let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$x); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$y); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$y); + + let extraClassDeclaration = [{ + // FixedOutputRangeInterface: + quant::UniformQuantizedType GetFixedOutputRange( + bool is_signed, int bit_width) { + auto result_type = y().getType().cast(); + if (!result_type.getElementType().isa()) return {}; + Builder builder(result_type.getContext()); + + // Only support 8-bits + if (bit_width != 8) return {}; + IntegerType storage_type = builder.getIntegerType(bit_width); + + double scale = 1.0 / 256; + int64_t zero_point, storage_min, storage_max; + if (is_signed) { + zero_point = -128; + storage_min = -128; + storage_max = 127; + } else { + zero_point = 0; + storage_min = 0; + storage_max = 255; + } + + return quant::UniformQuantizedType::getChecked( + is_signed, storage_type, result_type.getElementType(), scale, + zero_point, storage_min, storage_max, builder.getUnknownLoc()); + } + }]; } def TFL_LogOp: TFL_Op<"log", [ @@ -1690,10 +1810,11 @@ def TFL_LogOp: TFL_Op<"log", [ let hasFolder = 1; } -// TODO(b/130643170): Adds some constraint for the input/output element types. 
def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ NoSideEffect, SameOperandsAndResultShape, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) FixedResultScale>, @@ -1706,9 +1827,9 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ input - log(reduce_sum(exp(input), dim)) }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -1727,6 +1848,9 @@ def MaxPoolOperandAndResultConstraints : PredOpTrait<"MaxPool2D operand and " TFL_TCresVTEtIsSameAsOp<0, 0>]>>; def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ + TFL_OperandHasRank<0, 4>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, MaxPoolOperandAndResultConstraints, SameOperandsAndResultsScale, @@ -1741,7 +1865,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$input, TFL_PaddingAttr:$padding, I32Attr:$stride_w, I32Attr:$stride_h, @@ -1750,7 +1874,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ TFL_AFAttr:$fused_activation_function ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$output); let hasOptions = 1; @@ -1782,7 +1906,11 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, TFL_GpuTargetOp]> { +def TFL_MeanOp : TFL_Op<"mean", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + TFL_GpuTargetOp]> { let summary = "Mean operator"; let description = [{ @@ -1794,13 +1922,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, TFL_GpuTargetOp]> { }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, UI8]>:$input, TFL_TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, UI8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1821,20 +1949,23 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { let arguments = (ins TFL_TensorOf<[I32, I64]>:$indices, TFL_I32Tensor:$depth, - TFL_TensorOf<[F32, I32, I64, I1]>:$on_value, - TFL_TensorOf<[F32, I32, I64, I1]>:$off_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$on_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$off_value, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I1]>:$output + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$output ); let hasOptions = 1; } -def TFL_RoundOp: TFL_Op<"round", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_RoundOp: TFL_Op<"round", [ + NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultType]> { let summary = "Round operator"; let description = [{ @@ -1842,16 +1973,23 @@ Rounds the values of a tensor to the nearest integer, element-wise. 
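The log_softmax op above documents its result as input - log(reduce_sum(exp(input), dim)). A hedged single-row reference sketch (not the TFLite kernel) that subtracts the row maximum first so exp() cannot overflow:

```
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference log-softmax over one 1-D row: y[i] = x[i] - log(sum_j exp(x[j])).
// Subtracting the maximum first is mathematically equivalent and keeps exp()
// in a safe range.
std::vector<float> LogSoftmax(const std::vector<float>& x) {
  float max_val = *std::max_element(x.begin(), x.end());
  double sum = 0.0;
  for (float v : x) sum += std::exp(v - max_val);
  float log_sum = max_val + static_cast<float>(std::log(sum));
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] - log_sum;
  return y;
}

int main() {
  std::vector<float> y = LogSoftmax({1.0f, 2.0f, 3.0f});
  for (float v : y) std::printf("%f ", v);  // approx -2.4076 -1.4076 -0.4076
  std::printf("\n");
  return 0;
}
```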
}]; let arguments = (ins - TFL_TensorOf<[F32]>:$x + TFL_FpTensor:$x ); let results = (outs - TFL_TensorOf<[F32]>:$y + TFL_FpTensor:$y ); } def TFL_SliceOp : TFL_Op<"slice", [ - NoSideEffect, SameOperandsAndResultsScale, TFL_GpuTargetOp]> { + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, + TFL_OperandHasRankAtMost<1, 1>, + TFL_OperandHasRankAtMost<2, 1>, + TFL_GpuTargetOp]> { let summary = "Return a slice from 'input'."; let description = [{ @@ -1869,13 +2007,13 @@ equivalent to setting: }]; let arguments = (ins - AnyTensor:$input, + TFL_TensorOf<[F32, I32, I64, I8, UI8, I1, TFL_Str, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$begin, TFL_I32OrI64Tensor:$size ); let results = (outs - AnyTensor:$output + TFL_TensorOf<[F32, I32, I64, I8, UI8, I1, TFL_Str, QI8, QUI8, TFL_Quint8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -1883,7 +2021,11 @@ equivalent to setting: let hasCanonicalizer = 1; } -def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { +def TFL_SumOp: TFL_Op<"sum", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { + let summary = "Sum operator"; let description = [{ @@ -1891,19 +2033,22 @@ def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { }]; let arguments = (ins - AnyTensor:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); - let results = (outs AnyTensor); + let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; } def TFL_ReduceMinOp: TFL_Op<"reduce_min", [ - NoSideEffect, SameOperandsAndResultsScale]> { + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale]> { let summary = "Min-reduction operator"; let description = [{ @@ -1911,19 +2056,23 @@ def TFL_ReduceMinOp: TFL_Op<"reduce_min", [ }]; let arguments = (ins - AnyTensor:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); - let results = (outs AnyTensor); + let results = (outs + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; } def TFL_ReduceMaxOp: TFL_Op<"reduce_max", [ - NoSideEffect, SameOperandsAndResultsScale]> { + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale]> { let summary = "Max-reduction operator"; let description = [{ @@ -1931,18 +2080,22 @@ def TFL_ReduceMaxOp: TFL_Op<"reduce_max", [ }]; let arguments = (ins - AnyTensor:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); - let results = (outs AnyTensor); + let results = (outs + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; } -def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { +def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = "Prod-reduction operator"; let description = [{ @@ -1950,12 +2103,13 @@ def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64]>:$input, + 
TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); - let results = (outs AnyTensor); + let results = (outs + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1986,12 +2140,13 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ let hasOptions = 0; } -def TFL_MulOp : TFL_Op<"mul", [ResultsBroadcastableShape, - NoSideEffect, - Commutative, - BinaryOpSameElementTypeConstraint, - TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, - TFL_GpuTargetOp]> { +def TFL_MulOp : TFL_Op<"mul", [ + ResultsBroadcastableShape, + NoSideEffect, + Commutative, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, + TFL_GpuTargetOp]> { let summary = "Multiplication operator"; let description = [{ @@ -2032,7 +2187,11 @@ def TFL_NegOp: TFL_Op<"neg", [NoSideEffect, SameOperandsAndResultType]> { let hasFolder = 1; } -def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_PackOp : TFL_Op<"pack", [ + PredOpTrait<"values and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale]> { let summary = "Packs a list of tensors along a dimension into one tensor"; let description = [{ @@ -2063,14 +2222,14 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { }]; let arguments = (ins - TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$values, + TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$values, - I32Attr:$values_count, + Confined:$values_count, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -2081,8 +2240,11 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { } def TFL_PadOp : TFL_Op<"pad", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandRankEquals1DimOfOperand<0, 1>, TFL_GpuTargetOp]> { @@ -2113,22 +2275,25 @@ def TFL_PadOp : TFL_Op<"pad", [ ``` }]; - let arguments = (ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + let arguments = (ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } def TFL_PadV2Op : TFL_Op<"padv2", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandHasRank<2, 0>, TFL_OperandRankEquals1DimOfOperand<0, 1>, PredOpTrait<"input and constant value operands must have same element type", - TCopVTEtAreSameAt<[0, 2]>>]> { + TFL_TCopVTEtAreSameAt<0, 2>>]> { let summary = "Padding operator v2"; let description = [{ @@ -2159,11 +2324,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding, - TFL_TensorOf<[F32, I8, I32, I64]>:$constant_values); 
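pad and padv2 above both require a [input_rank, 2] paddings operand (TFL_OperandHasRank<1, 2> together with TFL_OperandRankEquals1DimOfOperand<0, 1>). A small sketch of the resulting output shape, assuming the usual convention that row i holds the before/after padding amounts for dimension i; padv2's constant_values only supplies the fill value and does not affect the shape:

```
#include <cstdio>
#include <utility>
#include <vector>

// Output shape of a pad/padv2-style op: each dimension grows by the "before"
// and "after" amounts in row i of the [rank, 2] paddings tensor.
std::vector<int> PaddedShape(const std::vector<int>& input_shape,
                             const std::vector<std::pair<int, int>>& paddings) {
  std::vector<int> out(input_shape.size());
  for (size_t i = 0; i < input_shape.size(); ++i) {
    out[i] = paddings[i].first + input_shape[i] + paddings[i].second;
  }
  return out;
}

int main() {
  // A 2x3 input padded by [[1, 1], [2, 2]] becomes 4x7.
  std::vector<int> out = PaddedShape({2, 3}, {{1, 1}, {2, 2}});
  std::printf("%d x %d\n", out[0], out[1]);  // 4 x 7
  return 0;
}
```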
+ TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$constant_values); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -2191,26 +2356,29 @@ def TFL_PowOp : TFL_Op<"pow", [ResultsBroadcastableShape, let builders = [TFL_BroadcastableBinaryBuilder]; } -def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect, - TFL_GpuTargetOp, - SameOperandsAndResultsScale]> { +def TFL_PReluOp : TFL_Op<"prelu", [ + NoSideEffect, + ResultsBroadcastableShape, + TFL_GpuTargetOp, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + BinaryOpSameElementTypeConstraint, + PredOpTrait<"input and output must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Parameterized Relu operator"; let description = [{ Parameterized Relu operator x -> x >= 0 ? x : (alpha * x) where alpha is a trainable tensor. - alpha should have one less rank than the input as it doesn't have the batch - dimension, and the other dimensions either should be the same size as input - or size 1, where it is broadcasted in the second case. + input and alpha should be the same size as input or be broadcastable. }]; let arguments = ( - ins TFL_TensorOf<[F32, QUI8]>:$input, - TFL_TensorOf<[F32, QUI8]>:$alpha + ins TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$input, + TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$alpha ); - let results = (outs TFL_TensorOf<[F32, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output); let verifier = [{ return Verify(*this); }]; } @@ -2228,10 +2396,13 @@ def TFL_RankOp: TFL_Op<"rank", [NoSideEffect]> { let hasFolder = 1; } -def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { +def TFL_ReluOp: TFL_Op<"relu", [ + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultsScale, + TFL_GpuTargetOp]> { let summary = "Relu operator"; let description = [{ @@ -2239,9 +2410,9 @@ def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, x -> max(0, x) }]; - let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$x); - let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$y); // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the @@ -2255,10 +2426,13 @@ def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, ]; } -def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { +def TFL_Relu6Op: TFL_Op<"relu6", [ + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultsScale, + TFL_GpuTargetOp]> { let summary = "Relu6 operator"; let description = [{ @@ -2266,9 +2440,9 @@ def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, x -> max(0, min(6, x)) }]; - let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$x); - let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$y); // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. 
Currently, it is used by the @@ -2282,9 +2456,12 @@ def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, ]; } -def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { +def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultShape, + SameOperandsAndResultsScale]> { let summary = "Relu1 operator"; let description = [{ @@ -2292,9 +2469,9 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, x -> max(-1, min(1, x)) }]; - let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$x); - let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$y); // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the @@ -2326,7 +2503,11 @@ def TFL_ReshapeOp: TFL_Op<"reshape", [ let hasFolder = 1; } -def TFL_ReverseSequenceOp : TFL_Op<"reverse_sequence", [NoSideEffect]> { +def TFL_ReverseSequenceOp : TFL_Op<"reverse_sequence", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + TFL_OperandHasRank<1, 1>]> { let summary = "Reverses variable length slices."; let description = [{ @@ -2343,15 +2524,15 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension }]; let arguments = (ins - TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI16, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$seq_lengths, - I32Attr:$seq_dim, - I32Attr:$batch_dim + Confined:$seq_dim, + Confined:$batch_dim ); let results = (outs - TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I32, I64, QI16, QUI8, TFL_Quint8]>:$output ); let hasOptions = 1; @@ -2359,6 +2540,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, SameOperandsAndResultType, + SameOperandsAndResultShape, NoQuantizableResult, TFL_GpuTargetOp]> { let summary = "Reciprocal of square root operator"; @@ -2367,9 +2549,9 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, Computes element-wise reverse square root of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); let hasFolder = 1; } @@ -2383,7 +2565,7 @@ def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect]> { let arguments = (ins AnyTensor:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[I32, I64]>:$output); DerivedTypeAttr out_type = DerivedTypeAttr<[{ return getResult().getType().cast().getElementType(); @@ -2392,9 +2574,11 @@ def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect]> { let hasOptions = 1; } -// TODO(jpienaar): Flesh this out. 
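relu, relu6 and relu_n1_to_1 above differ only in their clamp interval, and leaky_relu earlier in this file scales the negative side by alpha. A float reference sketch of all four (the quantized paths are not shown):

```
#include <algorithm>
#include <cstdio>

// Float reference forms of the activation ops defined above.
float Relu(float x)  { return std::max(0.0f, x); }                  // max(0, x)
float Relu6(float x) { return std::max(0.0f, std::min(6.0f, x)); }  // max(0, min(6, x))
float Relu1(float x) { return std::max(-1.0f, std::min(1.0f, x)); } // relu_n1_to_1
float LeakyRelu(float x, float alpha) { return x >= 0.0f ? x : alpha * x; }

int main() {
  std::printf("%g %g %g %g\n", Relu(-2.0f), Relu6(8.0f), Relu1(-3.0f),
              LeakyRelu(-2.0f, 0.1f));  // 0 6 -1 -0.2
  return 0;
}
```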
-def TFL_RangeOp: TFL_Op<"range", [NoSideEffect, TFL_OperandHasRank<0, 0>, - TFL_OperandHasRank<1, 0>, TFL_OperandHasRank<2, 0>, +def TFL_RangeOp: TFL_Op<"range", [ + NoSideEffect, + TFL_OperandHasRank<0, 0>, + TFL_OperandHasRank<1, 0>, + TFL_OperandHasRank<2, 0>, PredOpTrait<"operands and output must have same element type", And<[TCresVTEtIsSameAsOp<0, 0>, TCresVTEtIsSameAsOp<0, 1>, TCresVTEtIsSameAsOp<0, 2>]>>]> { @@ -2406,17 +2590,20 @@ def TFL_RangeOp: TFL_Op<"range", [NoSideEffect, TFL_OperandHasRank<0, 0>, }]; let arguments = (ins - AnyTensor:$start, - AnyTensor:$limit, - AnyTensor:$delta); + TFL_TensorOf<[I32, F32]>:$start, + TFL_TensorOf<[I32, F32]>:$limit, + TFL_TensorOf<[I32, F32]>:$delta); - let results = (outs AnyTensor:$result); + let results = (outs TFL_TensorOf<[I32, F32]>:$result); let hasFolder = 1; } -def TFL_ReverseV2Op: TFL_Op<"reverse_v2", - [NoSideEffect, TFL_OperandHasRank<1,1>]> { +def TFL_ReverseV2Op: TFL_Op<"reverse_v2", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + TFL_OperandHasRank<1, 1>]> { let summary = "ReverseV2 Operator"; let description = [{ @@ -2438,21 +2625,21 @@ def TFL_ReverseV2Op: TFL_Op<"reverse_v2", let arguments = ( ins - TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$input, - TFL_TensorOf<[I32, I64]>:$axis + TFL_TensorOf<[F32, UI8, I16, I32, I64, QI16, QUI8, TFL_Quint8, I1]>:$input, + TFL_I32Tensor:$axis ); let results = (outs - TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$output - ); + TFL_TensorOf<[F32, UI8, I16, I32, I64, QI16, QUI8, TFL_Quint8, I1]>:$output); } // Select has many instances in TF models where one or more of its operands // are unranked. Therefore, we skip adding shape constraints here. -def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, +def TFL_SelectOp : TFL_Op<"select", [ + NoSideEffect, PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, PredOpTrait<"operands and result have same element type", - TCresVTEtIsSameAsOp<0, 1>>]> { + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "Select operator"; let description = [{ @@ -2465,9 +2652,11 @@ def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, let arguments = (ins TFL_BoolTensor:$condition, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); - let results = (outs AnyTensor:$output); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$y); + + let results = (outs + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); // TODO(jpienaar): autogenerate this. 
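range above constrains start, limit and delta to scalars (the three rank-0 operand traits) of a common element type and produces a 1-D result. A sketch of the element count and contents, assuming the half-open [start, limit) convention of tf.range:

```
#include <cmath>
#include <cstdio>
#include <vector>

// Contents of range(start, limit, delta), assuming the half-open
// [start, limit) convention: size = ceil((limit - start) / delta).
std::vector<float> Range(float start, float limit, float delta) {
  int size = static_cast<int>(std::ceil((limit - start) / delta));
  if (size < 0) size = 0;
  std::vector<float> out(size);
  for (int i = 0; i < size; ++i) out[i] = start + i * delta;
  return out;
}

int main() {
  std::vector<float> r = Range(3.0f, 18.0f, 3.0f);  // 3 6 9 12 15
  std::printf("size=%zu last=%g\n", r.size(), r.back());
  return 0;
}
```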
let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " @@ -2481,7 +2670,12 @@ def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, let hasOptions = 1; } -def TFL_SelectV2Op : TFL_Op<"select_v2", [NoSideEffect]> { +def TFL_SelectV2Op : TFL_Op<"select_v2", [ + NoSideEffect, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<1, 2, 4>, + PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, + PredOpTrait<"operands and result have same element type", + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "SelectV2 operator"; let description = [{ @@ -2494,9 +2688,11 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [NoSideEffect]> { let arguments = (ins TFL_BoolTensor:$condition, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); - let results = (outs AnyTensor:$output); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$y); + + let results = (outs + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " "Value cond, Value x, Value y", @@ -2525,9 +2721,11 @@ def TFL_SinOp: TFL_Op<"sin", [ let hasFolder = 1; } -// TODO(b/130643170): Adds some constraint for the input/output element types. def TFL_SoftmaxOp : TFL_Op<"softmax", [ NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRankRange<0, 1, 4>, SameOperandsAndResultShape, // zero_point = 0 // scale = 1. / (max_value + 1) @@ -2543,11 +2741,11 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$input, F32Attr:$beta ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -2590,7 +2788,11 @@ def TFL_SquareOp: TFL_Op<"square", [ let hasFolder = 1; } -def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { +def TFL_SubOp : TFL_Op<"sub", [ + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, + NoSideEffect]> { let summary = "Subtraction operator"; let description = [{ @@ -2598,11 +2800,11 @@ def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs, + ins TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$lhs, + TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$output); let hasFolder = 1; @@ -2618,6 +2820,8 @@ def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { // TODO(jpienaar): Expand the kernel implementation to support all types besides // I32 and F32. 
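sub and select_v2 above (and squared_difference just below) pick up TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape constraints. The sketch below illustrates the NumPy-style rule such broadcastable shapes follow — dimensions are aligned from the right and each pair must be equal or 1. It illustrates the rule only; it is not the TableGen predicate itself:

```
#include <algorithm>
#include <cstdio>
#include <vector>

// NumPy-style broadcast of two shapes: align from the right; each pair of
// dimensions must be equal, or one of them must be 1. Returns false if the
// shapes are not broadcastable.
bool BroadcastShapes(const std::vector<int>& a, const std::vector<int>& b,
                     std::vector<int>* out) {
  size_t rank = std::max(a.size(), b.size());
  out->assign(rank, 1);
  for (size_t i = 0; i < rank; ++i) {
    int da = i < a.size() ? a[a.size() - 1 - i] : 1;
    int db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1) return false;
    (*out)[rank - 1 - i] = std::max(da, db);
  }
  return true;
}

int main() {
  std::vector<int> out;
  // [8, 1, 6] and [7, 1] broadcast to [8, 7, 6].
  if (BroadcastShapes({8, 1, 6}, {7, 1}, &out))
    std::printf("%d %d %d\n", out[0], out[1], out[2]);
  return 0;
}
```

The last template parameter of the trait additionally bounds how large the broadcast is allowed to be for the TFLite kernels; the sketch does not model that bound.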
def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + SameOperandsAndResultElementType, ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult, @@ -2629,10 +2833,10 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32]>:$lhs, + TFL_TensorOf<[F32, I32]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32]>:$output); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -2644,6 +2848,8 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ def TFL_TanhOp: TFL_Op<"tanh", [ NoSideEffect, SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // central_value = min_value / 2 + (max_value - 1) / 2 + 1 // zero_point = central_value // scale = 1. / (central_value - min_value) @@ -2656,9 +2862,9 @@ def TFL_TanhOp: TFL_Op<"tanh", [ Computes element-wise Hyperbolic tangent of input }]; - let arguments = (ins TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$input); - let results = (outs TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$output); // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the @@ -2672,9 +2878,11 @@ def TFL_TanhOp: TFL_Op<"tanh", [ ]; } -def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, - PredOpTrait<"resultant element type needs to match first operand type", - TFL_TCresVTEtIsSameAsOp<0,0>>]> { +def TFL_TileOp: TFL_Op<"tile", [ + NoSideEffect, + SameOperandsAndResultsScale, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Tile operator."; let description = [{ Constructs a tensor by tiling a given tensor. @@ -2687,11 +2895,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$input, + TFL_TensorOf<[F32, I1, I32, I64, UI8, QUI8, TFL_Str]>:$input, TFL_I32OrI64Tensor:$multiples); let results = (outs - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$output); + TFL_TensorOf<[F32, I1, I32, I64, UI8, QUI8, TFL_Str]>:$output); let hasOptions = 0; } @@ -2699,9 +2907,13 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, // TODO(jpienaar): Maybe make it accept any single element tensor as `k`. // TODO(jpienaar): Check that input has one or more dimensions. 
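tile above builds its output by repeating the input along each axis according to a multiples vector, so output_dim[i] = input_dim[i] * multiples[i]. A 1-D reference sketch of the index mapping:

```
#include <cstdio>
#include <vector>

// 1-D reference tile: the output has input.size() * multiple elements and
// output[j] = input[j % input.size()]. Higher ranks apply the same rule
// per dimension.
std::vector<int> Tile1D(const std::vector<int>& input, int multiple) {
  std::vector<int> out(input.size() * multiple);
  for (size_t j = 0; j < out.size(); ++j) out[j] = input[j % input.size()];
  return out;
}

int main() {
  std::vector<int> out = Tile1D({1, 2, 3}, 2);  // 1 2 3 1 2 3
  for (int v : out) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```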
// TODO(jpienaar): Check that k is less or equal the internal dimension -def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, +def TFL_TopKV2Op: TFL_Op<"topk_v2", [ + NoSideEffect, + TFL_OperandHasRankAtLeast<0, 1>, + TFL_OperandHasRank<1, 0>, PredOpTrait<"result and input element type match", - TCresVTEtIsSameAsOp<0,0>>, SameOperandsAndResultsScale]> { + TFL_TCresVTEtIsSameAsOp<0,0>>, + SameOperandsAndResultsScale]> { let summary = "TopK operator"; let description = [{ @@ -2711,11 +2923,11 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$input, TFL_I32Tensor:$k); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$values, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$values, TFL_I32Tensor:$indices); let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " @@ -2725,29 +2937,27 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, let hasOptions = 1; } -def TFL_TransposeOp : TFL_Op<"transpose", - [NoSideEffect, - TFL_OperandHasRank<1,1>, - // TODO(jpienaar): these are only true dynamically, change so that it works - // with unknowns. - // TFL_OperandRankEquals1DimOfOperand<0, 1>, - PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>>, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { +def TFL_TransposeOp : TFL_Op<"transpose", [ + NoSideEffect, + TFL_OperandHasRankAtMost<0, 5>, + TFL_OperandHasRank<1, 1>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + SameOperandsAndResultsScale, + TFL_GpuTargetOp]> { let summary = "Transpose operator"; let description = [{ Returns the Transpose of x }]; - let arguments = ( - ins AnyTensor:$x, + let arguments = (ins + TFL_TensorOf<[I32, F32, I8, UI8, QI8, QUI8, TFL_Quint8, I1, I64]>:$input, TFL_TensorOf<[I32]>:$perm ); let results = (outs - AnyTensor:$y + TFL_TensorOf<[I32, F32, I8, UI8, QI8, QUI8, TFL_Quint8, I1, I64]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -2755,7 +2965,10 @@ def TFL_TransposeOp : TFL_Op<"transpose", let hasFolder = 1; } -def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_UnpackOp : TFL_Op<"unpack", [ + NoSideEffect, + SameOperandsAndResultElementType, + SameOperandsAndResultsScale]> { let summary = "Unpacks a tensor along a dimension into multiple tensors"; let description = [{ @@ -2776,14 +2989,14 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$input, I32Attr:$num, I32Attr:$axis ); let results = (outs - TFL_VariadicTensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$outputs + TFL_VariadicTensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2791,16 +3004,19 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> let hasOptions = 1; } -def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [NoSideEffect]> { +def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = "ZerosLike operator"; let description = [{ Returns a tensor of zeros with the same shape and type as the 
input tensor. }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[I64, I32, F32]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[I64, I32, F32]>:$output); let hasOptions = 1; } @@ -2834,8 +3050,9 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankRange<0, 3, 4>, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>> + TFL_TCresVTEtIsSameAsOp<0, 0>> ]> { let summary = "SpaceToBatchNd operator"; @@ -2844,13 +3061,13 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, - TFL_TensorOf<[I32]>:$block_shape, - TFL_TensorOf<[I32]>:$paddings + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, + TFL_I32Tensor:$block_shape, + TFL_I32Tensor:$paddings ); let results = (outs - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output ); } @@ -2858,7 +3075,8 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ NoSideEffect, SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>>, + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRankAtMost<0, 4>, TFL_GpuTargetOp ]> { let summary = "SpaceToDepth operator"; @@ -2871,12 +3089,12 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, - I32Attr:$block_size + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, + Confined:$block_size ); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output ); let hasOptions = 1; @@ -2887,7 +3105,7 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_OperandHasRankLessThanOrEqualTo<0, 4> + TFL_OperandHasRankAtMost<0, 4> ]> { let summary = "DepthToSpace operator"; @@ -2901,12 +3119,12 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, UI8, QI8, QUI8]>:$input, Confined:$block_size ); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, UI8, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -2926,12 +3144,12 @@ def TFL_SplitOp : TFL_Op<"split", [ let arguments = (ins TFL_TensorOf<[I32]>:$split_dim, - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_TensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$value, Confined:$num_splits ); let results = (outs - TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2949,14 +3167,14 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale] }]; let arguments = (ins - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_TensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$value, TFL_1DTensorOf<[I32], [I32]>:$size_splits, TFL_0DTensorOf<[I32], [I32]>:$split_dim, Confined:$num_splits ); let 
results = (outs - TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2965,7 +3183,12 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale] } def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [ - NoSideEffect, SameOperandsAndResultsScale]> { + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRank<0, 4>, + TFL_OperandHasRank<1, 1>, + SameOperandsAndResultsScale]> { let summary = "ResizeBilinear Op"; let description = [{ @@ -2973,23 +3196,26 @@ def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [ }]; let arguments = (ins - // TODO(ycling): Support quantized types. - TFL_TensorOf<[F32, I32, QI8, QUI8]>:$input, - TFL_TensorOf<[I32]>:$size, + TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$input, + TFL_I32Tensor:$size, BoolAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - TFL_TensorOf<[F32, QI8, QUI8]>:$output + TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output ); let hasOptions = 1; } -def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", - [NoSideEffect, - SameOperandsAndResultsScale]> { +def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRank<0, 4>, + TFL_OperandHasRank<1, 1>, + SameOperandsAndResultsScale]> { let summary = "ResizeNearestNeighbor Op"; let description = [{ @@ -2997,20 +3223,28 @@ def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", }]; let arguments = (ins - TFL_TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$input, - TFL_TensorOf<[I32]>:$size, + TFL_TensorOf<[F32, TFL_Quint8, QUI8, QI8]>:$input, + TFL_I32Tensor:$size, BoolAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - TFL_TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$output + TFL_TensorOf<[F32, TFL_Quint8, QUI8, QI8]>:$output ); let hasOptions = 1; } -def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [NoSideEffect]> { +def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [ + NoSideEffect, + PredOpTrait<"sparse_values and dense must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 2>>, + PredOpTrait<"default_value and dense must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 3>>, + TFL_OperandHasRankAtMost<0, 2>, + TFL_OperandHasRankAtMost<1, 1>, + TFL_OperandHasRankAtMost<2, 1>]> { let summary = "Converts a sparse representation into a dense tensor."; let description = [{ @@ -3038,21 +3272,24 @@ are checked during execution. 
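sparse_to_dense above gains predicates tying sparse_values (operand 2) and default_value (operand 3) to the element type of the dense result; the op fills a tensor of output_shape with default_value and then scatters sparse_values at sparse_indices. A 1-D sketch of that behavior:

```
#include <cstdio>
#include <vector>

// 1-D sparse_to_dense: start from default_value everywhere, then write
// sparse_values[i] at position sparse_indices[i].
std::vector<float> SparseToDense1D(const std::vector<int>& sparse_indices,
                                   int output_size,
                                   const std::vector<float>& sparse_values,
                                   float default_value) {
  std::vector<float> dense(output_size, default_value);
  for (size_t i = 0; i < sparse_indices.size(); ++i)
    dense[sparse_indices[i]] = sparse_values[i];
  return dense;
}

int main() {
  std::vector<float> d = SparseToDense1D({1, 3}, 5, {7.0f, 9.0f}, 0.0f);
  for (float v : d) std::printf("%g ", v);  // 0 7 0 9 0
  std::printf("\n");
  return 0;
}
```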
let arguments = (ins TFL_I32OrI64Tensor:$sparse_indices, TFL_I32OrI64Tensor:$output_shape, - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$sparse_values, - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$default_value + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$sparse_values, + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$default_value ); let results = (outs - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$dense + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$dense ); } -def TFL_StridedSliceOp: TFL_Op<"strided_slice", - [ +def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ NoSideEffect, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 5>, + TFL_OperandHasRank<1, 1>, + TFL_OperandHasRank<2, 1>, + TFL_OperandHasRank<3, 1>, TFL_GpuTargetOp ]> { let summary = "StridedSlice Op"; @@ -3062,20 +3299,20 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$input, - TFL_TensorOf<[I32]>:$begin, - TFL_TensorOf<[I32]>:$end, - TFL_TensorOf<[I32]>:$strides, + TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8]>:$input, + TFL_I32Tensor:$begin, + TFL_I32Tensor:$end, + TFL_I32Tensor:$strides, I32Attr:$begin_mask, I32Attr:$end_mask, - I32Attr:$ellipsis_mask, - I32Attr:$new_axis_mask, + Confined]>:$ellipsis_mask, + Confined]>:$new_axis_mask, I32Attr:$shrink_axis_mask ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8]>:$output ); let hasOptions = 1; @@ -3090,17 +3327,16 @@ def TFL_CastOp : TFL_Op<"cast", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$input + TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, UI8, Complex>]>:$input ); - let results = (outs TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, UI8, Complex>]>:$output); // TFLite's cast op does not utilize CastOptions, instead derives types // from the TfLiteTensors. let hasOptions = 0; } - def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ NoSideEffect, TFL_OperandHasRank<1, 2>, TFL_GpuTargetOp]> { let summary = "MirrorPad Operator. Pads a tensor with mirrored values."; @@ -3136,24 +3372,25 @@ def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ let hasOptions = 1; } -def TFL_UniqueOp: TFL_Op<"unique", [NoSideEffect]> { +def TFL_UniqueOp: TFL_Op<"unique", [ + TFL_OperandHasRank<0, 1>, + NoSideEffect]> { let summary = "Unique Op."; let description = [{ - This operation returns a tensor `y` containing all of the unique elements of `x` -sorted in the same order that they occur in `x`. This operation also returns a -tensor `idx` the same size as `x` that contains the index of each value of `x` -in the unique output `y`. In other words: + This operation returns a tensor `output` containing all of the unique elements +of `input` sorted in the same order that they occur in `input`. This operation +also returns a tensor `idx` the same size as `x` that contains the index of each +value of `input` in the unique output `output`. In other words: }]; let arguments = (ins - // TODO: add uint8 support after quantize support. 
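The reworded unique description above promises two results: the distinct values in order of first occurrence, plus an idx tensor that maps every input element back into that output. A short reference sketch:

```
#include <cstdio>
#include <unordered_map>
#include <vector>

// Reference unique: `values` holds each distinct element in order of first
// occurrence, and idx[i] is the position of input[i] inside `values`.
void Unique(const std::vector<int>& input, std::vector<int>* values,
            std::vector<int>* idx) {
  std::unordered_map<int, int> first_seen;  // element -> position in `values`
  values->clear();
  idx->resize(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    auto it = first_seen.find(input[i]);
    if (it == first_seen.end()) {
      it = first_seen.emplace(input[i], static_cast<int>(values->size())).first;
      values->push_back(input[i]);
    }
    (*idx)[i] = it->second;
  }
}

int main() {
  std::vector<int> values, idx;
  Unique({1, 1, 2, 4, 4, 4, 7}, &values, &idx);
  // values: 1 2 4 7   idx: 0 0 1 2 2 2 3
  for (int v : values) std::printf("%d ", v);
  std::printf("| ");
  for (int v : idx) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```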
- TFL_TensorOf<[I8, I16, I32, I64, F32]>:$input + TFL_TensorOf<[I8, QI8, UI8, QUI8, I16, QI16, I32, I64, F32]>:$input ); let results = (outs - TFL_TensorOf<[I8, I16, I32, I64, F32]>:$output, - TFL_TensorOf<[I32, I64]>:$idx + TFL_TensorOf<[I8, QI8, UI8, QUI8, I16, QI16, I32, I64, F32]>:$output, + TFL_I32OrI64Tensor:$idx ); DerivedTFLiteTypeAttr idx_out_type = DerivedTFLiteTypeAttr<[{ @@ -3224,7 +3461,7 @@ def TFL_QConstOp : Op:$output); let builders = [OpBuilder< "OpBuilder &, OperationState &state, TypeAttr qtype, Attribute value", @@ -3253,7 +3490,7 @@ def TFL_SparseQConstOp : Op:$output); let builders = [OpBuilder< "OpBuilder &, OperationState &state, TypeAttr qtype, " @@ -3269,7 +3506,9 @@ def TFL_SparseQConstOp : Op { + FirstAttrDerivedResultType, + SameOperandsAndResultShape, + NoQuantizableResult]> { let summary = "Quantize operator"; let description = [{ @@ -3278,16 +3517,18 @@ def TFL_QuantizeOp: TFL_Op<"quantize", [ }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$input, TensorTypeAttr:$qtype ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[QI8, QUI8, QI16, TFL_Quint8]>:$output); } -def TFL_DensifyOp: TFL_Op<"densify", [NoSideEffect, - SameOperandsAndResultType, - NoQuantizableResult]> { +def TFL_DensifyOp: TFL_Op<"densify", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoQuantizableResult]> { let summary = "Densify operator"; let description = [{ @@ -3390,6 +3631,19 @@ def TFL_LSTMOp : LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, LstmResultConstraint, + TFL_OperandHasRank<2, 2>, // input_to_forget_weights + TFL_OperandHasRank<3, 2>, // input_to_cell_weights + TFL_OperandIsNoneOrHasRank<5, 2>, // recurrent_to_input_weights + TFL_OperandHasRank<6, 2>, // recurrent_to_forget_weights + TFL_OperandHasRank<7, 2>, // recurrent_to_cell_weights + TFL_OperandIsNoneOrHasRank<9, 1>, // cell_to_input_weights + TFL_OperandIsNoneOrHasRank<10, 1>, // cell_to_forget_weights + TFL_OperandIsNoneOrHasRank<11, 1>, // cell_to_output_weights + TFL_OperandHasRank<13, 1>, // forget_gate_bias + TFL_OperandHasRank<14, 1>, // cell_gate_bias + TFL_OperandHasRank<15, 1>, // output_gate_bias + TFL_OperandIsNoneOrHasRank<16, 2>, // projection_weights + TFL_OperandIsNoneOrHasRank<17, 1>, // projection_bias TFL_StatefulOp]> { let summary = "The full lstm operator"; @@ -3416,23 +3670,23 @@ Ba et al. 
'Layer Normalization' ins TFL_TensorOf<[F32, QI8]>:$input, // Weights - TFL_TensorOfOrNone<[F32, I8, QI8]>:$input_to_input_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_forget_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_cell_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_output_weights, // Recurrent weights - TFL_TensorOfOrNone<[F32, I8, QI8]>:$recurrent_to_input_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_forget_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_cell_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_output_weights, // Cell weights - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_input_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_input_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_forget_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_forget_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_output_weights, // Bias TFL_TensorOfOrNone<[F32, QI32]>:$input_gate_bias, @@ -3441,7 +3695,7 @@ Ba et al. 'Layer Normalization' TFL_TensorOf<[F32, QI32]>:$output_gate_bias, // Projection weight and bias - TFL_TensorOfOrNone<[F32, I8, QI8]>:$projection_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$projection_weights, // Optional input TFL_TensorOfOrNone<[F32, QI32]>:$projection_bias, @@ -3457,8 +3711,8 @@ Ba et al. 'Layer Normalization' // Attributes TFL_AFAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, // Since this op is the FULL kernel only, constrain it. 
Confined< DefaultValuedAttr, @@ -3498,6 +3752,24 @@ def TFL_UnidirectionalSequenceLSTMOp : LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, LstmResultConstraint, + TFL_OperandHasRankAtLeast<0, 2>, // input + TFL_OperandIsNoneOrHasRank<1, 2>, // input_to_input_weights + TFL_OperandHasRank<2, 2>, // input_to_forget_weights + TFL_OperandHasRank<3, 2>, // input_to_cell_weights + TFL_OperandHasRank<4, 2>, // input_to_output_weights + TFL_OperandIsNoneOrHasRank<5, 2>, // recurrent_to_input_weights + TFL_OperandHasRank<6, 2>, // recurrent_to_forget_weights + TFL_OperandHasRank<7, 2>, // recurrent_to_cell_weights + TFL_OperandHasRank<8, 2>, // recurrent_to_output_weights + TFL_OperandIsNoneOrHasRank<9, 1>, // cell_to_input_weights + TFL_OperandIsNoneOrHasRank<10, 1>, // cell_to_forget_weights + TFL_OperandIsNoneOrHasRank<11, 1>, // cell_to_output_weights + TFL_OperandIsNoneOrHasRank<12, 1>, // input_gate_bias + TFL_OperandHasRank<13, 1>, // forget_gate_bias + TFL_OperandHasRank<14, 1>, // cell_gate_bias + TFL_OperandHasRank<15, 1>, // output_gate_bias + TFL_OperandIsNoneOrHasRank<16, 2>, // projection_weights + TFL_OperandIsNoneOrHasRank<17, 2>, // projection_bias TFL_StatefulOp]> { let summary = "Unidirectional sequence lstm operator"; @@ -3513,35 +3785,35 @@ def TFL_UnidirectionalSequenceLSTMOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, I8]>:$input, + ins TFL_FpTensor:$input, // Weights - TFL_TensorOfOrNone<[F32, I8]>:$input_to_input_weights, - TFL_TensorOf<[F32, I8]>:$input_to_forget_weights, - TFL_TensorOf<[F32, I8]>:$input_to_cell_weights, - TFL_TensorOf<[F32, I8]>:$input_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_output_weights, // Recurrent weights - TFL_TensorOfOrNone<[F32, I8]>:$recurrent_to_input_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_forget_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_cell_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_output_weights, // Cell weights - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_input_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_input_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_forget_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_forget_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_output_weights, // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, - TFL_TensorOf<[F32]>:$forget_gate_bias, - TFL_TensorOf<[F32]>:$cell_bias, - TFL_TensorOf<[F32]>:$output_gate_bias, + TFL_FpTensor:$forget_gate_bias, + TFL_FpTensor:$cell_bias, + TFL_FpTensor:$output_gate_bias, // Projection weight and bias - TFL_TensorOfOrNone<[F32, I8]>:$projection_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$projection_weights, // Optional input TFL_TensorOfOrNone<[F32]>:$projection_bias, @@ -3550,19 +3822,19 @@ def TFL_UnidirectionalSequenceLSTMOp : TFL_StatefulTensor:$input_cell_state, // Layer norm coefficients - TFL_TensorOfOrNone<[F32, I8]>:$input_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, I8]>:$forget_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, I8]>:$cell_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, 
I8]>:$output_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$input_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$forget_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$output_layer_norm_coefficients, // Attributes TFL_AFAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, BoolAttr:$time_major ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QI8]>:$output); let hasOptions = 1; @@ -3759,15 +4031,14 @@ def TFL_BidirectionalSequenceLSTMOp : }]; } -def RnnResultConstraint : PredOpTrait< - "the input and result tensor elemental types must be same", - TCresVTEtIsSameAsOp<0, 0>>; - // UnidirectionalSequenceRNN op. -def TFL_UnidirectionalSequenceRNNOp : - TFL_Op<"unidirectional_sequence_rnn", - [RnnResultConstraint, TFL_StatefulOp]> { - +def TFL_UnidirectionalSequenceRNNOp : TFL_Op<"unidirectional_sequence_rnn", [ + TFL_OperandHasRank<4, 2>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + PredOpTrait<"input and constant value operands must have same element type", + TFL_TCopVTEtAreSameAt<1, 2>>, + TFL_StatefulOp]> { let summary = "Unidirectional sequence rnn operator"; let description = [{ @@ -3784,16 +4055,16 @@ def TFL_UnidirectionalSequenceRNNOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, I8]>:$input, + ins TFL_FpTensor:$input, // Weights - TFL_TensorOf<[F32, I8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_input_weights, // Recurrent weights - TFL_TensorOf<[F32, I8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_input_weights, // Bias - TFL_TensorOf<[F32]>:$input_gate_bias, + TFL_FpTensor:$input_gate_bias, // Hidden state. TFL_StatefulTensor:$hidden_state, @@ -3803,7 +4074,7 @@ def TFL_UnidirectionalSequenceRNNOp : TFL_AFAttr:$fused_activation_function ); - let results = (outs TFL_TensorOf<[F32, I8]>:$output); + let results = (outs TFL_FpTensor:$output); let hasOptions = 1; @@ -3849,7 +4120,7 @@ def TFL_NumericVerifyOp : Op:$input, + TFL_TensorOf<[QI8, QUI8, QI16, F16, TFL_Quint8]>:$input, TFL_TensorOf<[F32]>:$ref, // Attributes @@ -3859,14 +4130,12 @@ def TFL_NumericVerifyOp : Op>; - // SVDF op. def TFL_SVDFOp : - TFL_Op<"svdf", - [SVDFResultConstraint, TFL_StatefulOp]> { + TFL_Op<"svdf", [ + PredOpTrait<"the input and result tensor elemental types must be same", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_StatefulOp]> { let summary = "Single value decomposition filter operator"; @@ -3878,13 +4147,13 @@ def TFL_SVDFOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, I8]>:$input, + ins TFL_TensorOf<[F32, QI8]>:$input, // Feature Weights. 
- TFL_TensorOf<[F32, I8]>:$feature_weights, + TFL_TensorOf<[F32, QI8, QUI8]>:$feature_weights, // Time weights - TFL_TensorOf<[F32, I8]>:$time_weights, + TFL_TensorOf<[F32, QI8]>:$time_weights, // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, @@ -3893,11 +4162,11 @@ def TFL_SVDFOp : TFL_StatefulTensor:$activation_state, // Attributes - I32Attr:$rank, + Confined:$rank, TFL_AFAttr:$fused_activation_function ); - let results = (outs TFL_TensorOf<[F32, I8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8]>:$output); let hasOptions = 1; @@ -3909,7 +4178,10 @@ def TFL_SVDFOp : }]; } -def TFL_SegmentSumOp: TFL_Op<"segment_sum", [NoSideEffect]> { +def TFL_SegmentSumOp: TFL_Op<"segment_sum", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "SegmentSum operator"; let description = [{ @@ -3917,7 +4189,7 @@ def TFL_SegmentSumOp: TFL_Op<"segment_sum", [NoSideEffect]> { }]; let arguments = (ins - TFL_TensorOf<[F32, I32]>:$data, + TFL_TensorOf<[F32, I32]>:$input, TFL_I32Tensor:$segment_ids ); let results = (outs TFL_TensorOf<[F32, I32]>:$output); @@ -3936,8 +4208,8 @@ def TFL_YieldOp : Op { } def TFL_WhileOp : Op, - SingleBlockImplicitTerminator<"YieldOp">]> { + DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"YieldOp">]> { let summary = [{While loop}]; let description = [{ @@ -3948,7 +4220,7 @@ def TFL_WhileOp : Op node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index c338b723a4a..ab80746f8b7 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -125,8 +125,8 @@ Status ConvertSavedModelToTFLiteFlatBuffer( std::vector node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. 
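Both converter call sites above now collect node_mins/node_maxs as vectors of optionals, so an input flag without mean/std stats contributes an empty slot instead of a fabricated range (the matching has_mean_value()/has_std_value() branch appears in tf_tfl_flatbuffer_helpers.cc further down). A standalone sketch of the same pattern using std::optional and a made-up InputStats struct; the min/max formula assumes the toco convention real = (quantized - mean) / std on an 8-bit unsigned grid:

```
#include <cstdio>
#include <optional>
#include <utility>
#include <vector>

// Hypothetical per-input stats; in the real flags these come from
// flag.mean_value() / flag.std_value() and may be absent.
struct InputStats {
  std::optional<double> mean;
  std::optional<double> stddev;
};

// Convert mean/std to a (min, max) range for an 8-bit unsigned grid,
// assuming real = (quantized - mean) / std.
std::pair<double, double> StatsToMinMax(double mean, double stddev) {
  return {(0.0 - mean) / stddev, (255.0 - mean) / stddev};
}

int main() {
  std::vector<InputStats> flags = {{127.5, 127.5}, {std::nullopt, std::nullopt}};
  std::vector<std::optional<double>> node_mins, node_maxs;
  for (const InputStats& f : flags) {
    if (f.mean && f.stddev) {
      auto [mn, mx] = StatsToMinMax(*f.mean, *f.stddev);
      node_mins.push_back(mn);
      node_maxs.push_back(mx);
    } else {
      // No stats recorded for this input: keep the slot, mark it empty.
      node_mins.push_back(std::nullopt);
      node_maxs.push_back(std::nullopt);
    }
  }
  std::printf("input 0 min=%g max=%g, input 1 has stats: %d\n",
              *node_mins[0], *node_maxs[0],
              static_cast<int>(node_mins[1].has_value()));  // -1 1 0
  return 0;
}
```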
TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( @@ -146,6 +146,10 @@ Status ConvertSavedModelToTFLiteFlatBuffer( saved_model_exported_names.begin(), saved_model_exported_names.end()); absl::Span exported_names(exported_names_in_vector); + if (exported_names.size() != 1) { + return errors::Unimplemented("Only support a single exported name."); + } + TF_ASSIGN_OR_RETURN(auto module, ImportSavedModel(model_flags.saved_model_dir(), model_flags.saved_model_version(), tags, diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 6dd44e666fb..8f2c8bc362c 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -121,6 +121,8 @@ DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { return DT_STRING; case toco::IODataType::BOOL: return DT_BOOL; + case toco::IODataType::COMPLEX64: + return DT_COMPLEX64; default: return DT_INVALID; } @@ -175,14 +177,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { return RegisterCustomBuiltinOps(extra_tf_opdefs); } -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs) { +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs) { quant_specs->inference_input_type = ConvertIODataTypeToDataType(toco_flags.inference_input_type()); tensorflow::DataType inference_type = @@ -209,11 +210,16 @@ Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, flag.shape().dims().end())); // Currently, only UINT8 and INT8 require inputs stats if (inference_type == DT_QINT8 || inference_type == DT_QUINT8) { - TF_ASSIGN_OR_RETURN( - auto min_max, InputStatsToMinMax(flag.mean_value(), flag.std_value(), - inference_type)); - node_mins->push_back(min_max.first); - node_maxs->push_back(min_max.second); + if (flag.has_mean_value() && flag.has_std_value()) { + TF_ASSIGN_OR_RETURN( + auto min_max, InputStatsToMinMax(flag.mean_value(), + flag.std_value(), inference_type)); + node_mins->push_back(min_max.first); + node_maxs->push_back(min_max.second); + } else { + node_mins->push_back(llvm::None); + node_maxs->push_back(llvm::None); + } } } @@ -252,7 +258,7 @@ Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { std::string error_message; auto output = mlir::openOutputFile(filename, &error_message); if (!error_message.empty()) { - return errors::InvalidArgument("Failed to open file in %s.", filename); + return errors::InvalidArgument("Failed to open file in ", filename); } mlir::PassManager pm(module.getContext()); pm.addPass(mlir::createPrintOpGraphPass(output->os())); diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index 3ea36e5eb1d..87e73912a46 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -34,14 +34,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags); 
// Populate quantization specs (or not) given user specified ranges for each // input arrays. -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs); +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs); // Convert imported MLIR file to TfLite flatbuffer. // This will also run relevant passes as well. diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index 23a65a88186..57417e95ec6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -3,6 +3,10 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library", ) +load( + "//third_party/mlir:tblgen.bzl", + "gentbl", +) package( default_visibility = [ @@ -23,6 +27,7 @@ package_group( exports_files([ "quantization_traits.h", "quantization_config.h", + "quantization_utils.h", ]) filegroup( @@ -34,6 +39,25 @@ filegroup( ], ) +gentbl( + name = "quantization_interfaces_inc_gen", + tbl_outs = [ + ( + "-gen-op-interface-decls", + "quantization_interface.h.inc", + ), + ( + "-gen-op-interface-defs", + "quantization_interface.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "quantization.td", + td_srcs = [ + ":quantization_td_files", + ], +) + tf_proto_library( name = "quantization_info_proto", srcs = [ @@ -71,9 +95,11 @@ cc_library( name = "quantization_lib", srcs = [ "quantization_driver.cc", + "quantization_interface.cc.inc", "quantization_utils.cc", ], hdrs = [ + "quantization_interface.h.inc", "quantization_traits.h", "quantization_utils.h", ], diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index b4fddceb580..2783297814b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -49,14 +49,16 @@ cc_library( ], hdrs = [ "tfl_to_std.h", + "//tensorflow/compiler/mlir/lite/quantization:quantization_utils.h", ], deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "@com_google_absl//absl/strings", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 0ac3fa419bc..a2e3c065113 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { @@ -38,6 +39,7 @@ namespace lite { TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, @@ -73,7 +75,7 @@ TfLiteStatus QuantizeModel( // Apply quantization passes PassManager pm(module->getContext()); TFL::QuantizationSpecs quant_specs; - quant_specs.inference_type = tensorflow::DT_QINT8; + quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; quant_specs.disable_per_channel = disable_per_channel; @@ -81,8 +83,10 @@ TfLiteStatus QuantizeModel( auto input_tf_type = tflite::TflTypeToTfType(input_type); if (input_tf_type == tensorflow::DT_FLOAT) { emit_adaptor = true; - } else if (input_tf_type == tensorflow::DT_UINT8) { - quant_specs.inference_type = tensorflow::DT_QUINT8; + } else if (input_tf_type == tensorflow::DT_UINT8 || + input_tf_type == tensorflow::DT_INT8 || + input_tf_type == tensorflow::DT_INT16) { + quant_specs.inference_type = input_tf_type; } pm.addPass(TFL::CreatePrepareQuantizePass(quant_specs)); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index 578aa6438de..d60df56b473 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -26,11 +26,13 @@ namespace mlir { namespace lite { // Quantize the `input_model` and write the result to a flatbuffer `builder`. -// The `input_type` and `output_type` can be float32/qint8/int8. +// The `input_type`, `output_type` and `inference_type` can be +// float32/qint8/int8/int16. // Return partially quantized model if `fully_quantize` is false. 
TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 77bd87a3c03..5bd1b71e631 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -46,7 +46,8 @@ TfLiteStatus QuantizeAnnotatedModel(llvm::StringRef buffer, tflite::StderrReporter error_reporter; return mlir::lite::QuantizeModel( - *model, tflite::TensorType_INT8, tflite::TensorType_INT8, {}, + *model, tflite::TensorType_INT8, tflite::TensorType_INT8, + tflite::TensorType_INT8, {}, /*disable_per_channel=*/false, /*fully_quantize=*/true, builder, &error_reporter); } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc index 8ea1709b15f..6b226fa68e7 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc @@ -17,6 +17,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" namespace mlir { namespace TFL { @@ -47,12 +48,18 @@ void ConvertMlirQuantOpsToTFLQuantOps(FuncOp func) { auto dcast = b.create(dq.getLoc(), dq.getResult().getType(), dq.arg()); dq.getResult().replaceAllUsesWith(dcast); + if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { + dcast.setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + } dq.erase(); } else if (auto q = llvm::dyn_cast(op)) { auto out_type = q.getResult().getType(); auto qcast = b.create(q.getLoc(), out_type, q.arg(), TypeAttr::get(out_type)); q.getResult().replaceAllUsesWith(qcast); + if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { + qcast.setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + } q.erase(); } }); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization.td b/tensorflow/compiler/mlir/lite/quantization/quantization.td index 7bfcdb65686..c1e392bd3ad 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization.td +++ b/tensorflow/compiler/mlir/lite/quantization/quantization.td @@ -63,6 +63,22 @@ def QI32 : QuantizedType<"Uniform", [32], 1>; // https://www.tensorflow.org/lite/performance/quantization_spec //===----------------------------------------------------------------------===// +// TODO(b/157870442): replace all FixedResultScale trait +def FixedOutputRangeInterface : OpInterface< + "FixedOutputRangeInterface"> { + let description = [{ + Interface for defining the fixed output range. + }]; + + let methods = [ + InterfaceMethod< + [{Returns the fixed output range.}], + "UniformQuantizedType", "GetFixedOutputRange", + (ins "bool":$sign, "int":$bit_width) + >, + ]; +} + // Specify this trait if the op has a fixed output value range. 
class FixedResultScale : NativeOpTrait::Impl")>; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 6b897bd5608..3edd9c36760 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -45,7 +45,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, absl::string_view inference_type, QuantizationSpecs* quant_specs) { std::vector input_nodes = absl::StrSplit(node_names, ','); - std::vector node_mins; + std::vector> node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); for (int i = 0; i < node_mins_str.size(); i++) { @@ -57,7 +57,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, } } - std::vector node_maxs; + std::vector> node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); for (int i = 0; i < node_maxs_str.size(); i++) { @@ -79,11 +79,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, quant_specs); } -bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs) { +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs) { quant_specs->inference_type = inference_type; // If min/max are not specified, just return; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index cac1df9eee1..0e766ec52b6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -69,7 +69,8 @@ struct QuantizationSpecs { // arguments. They are only used when `weight_quantization` is set to false, // and the model is required to have quantization parameters, either from // quantization aware training or calibration, for the remaining tensors. - std::vector> input_ranges; + std::vector, llvm::Optional>> + input_ranges; // The default ranges can be used when a tensor doesn't have quantization // parameters and couldn't be quantized. Used only for latency tests. @@ -90,7 +91,7 @@ struct QuantizationSpecs { bool RunWeightQuantization() const { return weight_quantization; } // Whether this inference type represents a signed storage type. - bool IsSignedInferenceType() { + bool IsSignedInferenceType() const { switch (inference_type) { case tensorflow::DT_QUINT8: case tensorflow::DT_QUINT16: @@ -102,7 +103,7 @@ struct QuantizationSpecs { // Gets the width of this quantization type. Returns 0 if it isn't a // quantization type. - int64_t GetQuantizationTypeWidth() { + int64_t GetQuantizationTypeWidth() const { switch (inference_type) { case tensorflow::DT_QINT8: case tensorflow::DT_QUINT8: @@ -130,11 +131,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, // Gets the quantization specification for input arrays. The array names are not // stored in the spec, and will be matched by position. The min/max will be // ignored if the inference_type isn't a quantized type. Returns true if failed. 
-bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs); +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs); } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 2964a3e79f8..89443b1ec65 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -494,6 +494,13 @@ void QuantizationDriver::QuantizeValue(Value value, QuantParams params, auto quantize = builder_.create(loc, new_type, value); auto dequantize = builder_.create( loc, expressed_type, quantize.getResult()); + + // This attribute is set to distinguish the quantize ops being added by the + // quantization pass. These ops can be removed without losing original + // program accuracy. + // TODO(fengliuai): make the attribute being part of op definition. + quantize.setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); + // `original_result` has a use to `quantize`, so this will replace that use // by the result of `dequantize`. Remember to reset that use afterwards value.replaceAllUsesWith(dequantize); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h index b59164b72e6..693f692c61a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h @@ -21,13 +21,18 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir { -namespace OpTrait { -namespace quant { - using QuantizedType = mlir::quant::QuantizedType; using UniformQuantizedType = mlir::quant::UniformQuantizedType; +namespace mlir { + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/lite/quantization/quantization_interface.h.inc" + +namespace OpTrait { +namespace quant { + // The base class that all the quantization related OpTrait implements. template class TraitType> struct QuantizationSpecTraitBase : public TraitBase { diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 3d50f280d0f..32f68aaae5f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -22,6 +22,7 @@ limitations under the License. 
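The quantization_driver.cc change above tags every quantize op the driver synthesizes with the unit attribute named by kVolatileOpAttrName (declared in quantization_utils.h later in this patch), and ConvertMlirQuantOpsToTFLQuantOps forwards that tag across the dialect conversion. The point is that downstream cleanups can tell converter-inserted casts from user-visible ones. A minimal sketch of such a cleanup, under the assumption that dropping a volatile qcast/dcast round trip is numerically a no-op; this is illustrative, not the pass the patch actually ships:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Casting.h"
#include "mlir/Dialect/Quant/QuantOps.h"  // from @llvm-project
#include "mlir/IR/Function.h"             // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"

// Bypass quantize/dequantize round trips whose quantize op carries the
// "volatile" tag, i.e. was inserted by the quantization driver itself.
void RemoveVolatileRoundTrips(mlir::FuncOp func) {
  llvm::SmallVector<mlir::quant::DequantizeCastOp, 4> candidates;
  func.walk([&](mlir::quant::DequantizeCastOp dq) {
    auto q = llvm::dyn_cast_or_null<mlir::quant::QuantizeCastOp>(
        dq.arg().getDefiningOp());
    if (q && q.getOperation()->getAttr(mlir::quant::kVolatileOpAttrName))
      candidates.push_back(dq);
  });
  for (auto dq : candidates) {
    auto q = llvm::cast<mlir::quant::QuantizeCastOp>(dq.arg().getDefiningOp());
    // Users of the dequantize read the original float value directly, then
    // the now-dead volatile quantize is dropped.
    dq.getResult().replaceAllUsesWith(q.arg());
    dq.erase();
    if (q.use_empty()) q.erase();
  }
}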
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project @@ -436,6 +437,16 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, llvm::SmallVector all_stats_ops; llvm::DenseSet redundant_stats_ops; + // Step 0: remove the quant::StatisticsOp which are used by the tfl.quantize + // op in case it overrides the information from training FakeQuant ops. + func.walk([&](quant::QuantizeCastOp q) { + auto input_op = q.arg().getDefiningOp(); + if (auto stats = llvm::dyn_cast_or_null(input_op)) { + q.setOperand(stats.arg()); + if (stats.use_empty()) stats.erase(); + } + }); + // Step 1: forward pass: propagate any value scales which are not produces // by `SameOperandsAndResultsScale`. Additionally, remove the value scales // which are produced by the `restricted_output_params`. diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 27ccc7d2b22..f17e44cd756 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -22,6 +22,8 @@ limitations under the License. #include #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project @@ -35,11 +37,17 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" namespace mlir { namespace quant { +// A unit attribute can be attached to the quantize/dequantize ops which are +// added by the quantization passes. These ops can be removed erased without +// losing accuracy. +constexpr char kVolatileOpAttrName[] = "volatile"; + using QuantParams = quant::QuantizedType; using SignedInteger = std::pair; // bitwidth and sign using QuantParamsForResults = llvm::SmallVector; @@ -363,6 +371,55 @@ struct ConvertUnsignedToSigned : public OpRewritePattern { } }; +// Fold Extra Requantize ops if the preceding ops has free scale requirement. +template +struct FoldTrivalRequantizeOp : public OpRewritePattern { + explicit FoldTrivalRequantizeOp(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(RQ op, + PatternRewriter& rewriter) const override { + Value pre_quantized = op.input(); + auto pre_quantized_type = + quant::QuantizedType::getQuantizedElementType(pre_quantized.getType()); + if (!pre_quantized_type) return failure(); + + Operation* def = pre_quantized.getDefiningOp(); + if (!def) return failure(); + if (llvm::isa(def) || + def->hasTrait() || + def->hasTrait()) { + return failure(); + } + + op.emitWarning("Remove trivial `rescale` op. 
Please fix the source graph."); + + llvm::SmallVector new_output_types; + for (auto result : def->getResults()) { + result.getUsers().begin()->dump(); + op.dump(); + if (result.hasOneUse() && *result.getUsers().begin() == op) { + new_output_types.push_back(op.qtype()); + } else { + new_output_types.push_back(result.getType()); + } + } + + // Remove this rescale op. + rewriter.replaceOp(op, {pre_quantized}); + + // Replace the output scale of the preceding op. + rewriter.setInsertionPointAfter(def); + OperationState new_state(def->getLoc(), def->getName().getStringRef(), + def->getOperands(), new_output_types, + def->getAttrs()); + Operation* new_op = rewriter.createOperation(new_state); + + rewriter.replaceOp(def, new_op->getResults()); + return success(); + } +}; + // Given a quantized type `input`, magnifying its scales by the factor stored in // `factor`. If `input` isn't a quantized type or the `factor` doesn't match the // dimension size of `input` or isn't floating-point, nullptr will be returned. diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 1f067aae685..5c69130c939 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -11,9 +11,9 @@ func @reshape_removeAdjacent(tensor<4x4x4xf32>) -> tensor<64xf32> { return %1 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacent -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE]] } // Checks that tfl.reshape should be removed if its output has more than one @@ -29,11 +29,11 @@ func @reshape_removeAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> tensor<64xf32> return %3 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacentWithMultipleUse -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %2 = addf %0, %1 -// CHECK: return %2 +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESULT:.*]] = addf %[[RESHAPE_1]], %[[RESHAPE_2]] +// CHECK: return %[[RESULT]] } // Checks that tfl.reshape should be kept if its output has more than one @@ -47,11 +47,11 @@ func @reshape_keepAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> (tensor<16x4xf32 return %0, %1 : tensor<16x4xf32>, tensor<64xf32> // CHECK-LABEL: func @reshape_keepAdjacentWithMultipleUse -// CHECK: %cst = constant dense<[16, 4]> : tensor<2xi32> -// CHECK: %cst_0 = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst_0) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return %0, %1 +// CHECK: %[[CST:.*]] = constant dense<[16, 4]> : tensor<2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: 
%[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST_0]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE_1]], %[[RESHAPE_2]] } // Checks that tfl.reshape should be removed if its output type is the same diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index 4b8993e2b26..a8463d51c7e 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -8,13 +8,13 @@ func @add_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, %2 = constant dense< 3.5> : tensor<4xf32> %3 = constant dense<-0.5> : tensor<4xf32> - // CHECK: %cst = constant dense<3.500000e+00> : tensor<4xf32> - // CHECK: %cst_0 = constant dense<-5.000000e-01> : tensor<4xf32> - // CHECK: %cst_1 = constant dense<6.000000e+00> : tensor - // CHECK: %cst_2 = constant dense<4.000000e+00> : tensor<4xf32> - // CHECK: %cst_3 = constant dense<5.000000e+00> : tensor<4xf32> - // CHECK: %cst_4 = constant dense<3.000000e+00> : tensor<4xf32> - // CHECK: %0 = tfl.add %cst, %cst_0 {fused_activation_function = "SIGN_BIT"} : tensor<4xf32> + // CHECK: %[[CST:.*]] = constant dense<3.500000e+00> : tensor<4xf32> + // CHECK: %[[CST_0:.*]] = constant dense<-5.000000e-01> : tensor<4xf32> + // CHECK: %[[CST_1:.*]] = constant dense<6.000000e+00> : tensor + // CHECK: %[[CST_2:.*]] = constant dense<4.000000e+00> : tensor<4xf32> + // CHECK: %[[CST_3:.*]] = constant dense<5.000000e+00> : tensor<4xf32> + // CHECK: %[[CST_4:.*]] = constant dense<3.000000e+00> : tensor<4xf32> + // CHECK: %0 = tfl.add %[[CST]], %[[CST_0]] {fused_activation_function = "SIGN_BIT"} : tensor<4xf32> %5 = "tfl.add"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32> %6 = "tfl.add"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32> @@ -33,10 +33,10 @@ func @add_int() -> (tensor, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) { %2 = constant dense< 4> : tensor<4xi32> %3 = constant dense<-2> : tensor<4xi32> - // CHECK: %cst = constant dense<9> : tensor - // CHECK: %cst_0 = constant dense<6> : tensor<4xi32> - // CHECK: %cst_1 = constant dense<5> : tensor<4xi32> - // CHECK: %cst_2 = constant dense<2> : tensor<4xi32> + // CHECK: %[[CST:.*]] = constant dense<9> : tensor + // CHECK: %[[CST_0:.*]] = constant dense<6> : tensor<4xi32> + // CHECK: %[[CST_1:.*]] = constant dense<5> : tensor<4xi32> + // CHECK: %[[CST_2:.*]] = constant dense<2> : tensor<4xi32> %5 = "tfl.add"(%0, %1) {fused_activation_function = "NONE"} : (tensor< i32>, tensor< i32>) -> tensor< i32> %6 = "tfl.add"(%0, %3) {fused_activation_function = "NONE"} : (tensor< i32>, tensor<4xi32>) -> tensor<4xi32> @@ -54,10 +54,10 @@ func @sub_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) %2 = constant dense< 3.5> : tensor<4xf32> %3 = constant dense<-0.5> : tensor<4xf32> - // CHECK: %cst = constant dense<3.000000e+00> : tensor - // CHECK: %cst_0 = constant dense<5.000000e+00> : tensor<4xf32> - // CHECK: %cst_1 = constant dense<2.000000e+00> : tensor<4xf32> - // CHECK: %cst_2 = constant dense<4.000000e+00> : tensor<4xf32> + // CHECK: %[[CST:.*]] = constant dense<3.000000e+00> : tensor + // CHECK: %[[CST_0:.*]] = constant dense<5.000000e+00> : tensor<4xf32> + // CHECK: %[[CST_1:.*]] = constant dense<2.000000e+00> : tensor<4xf32> + // CHECK: %[[CST_2:.*]] = 
constant dense<4.000000e+00> : tensor<4xf32> %5 = "tfl.sub"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32> %6 = "tfl.sub"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32> @@ -75,10 +75,10 @@ func @sub_int() -> (tensor, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) { %2 = constant dense< 4> : tensor<4xi32> %3 = constant dense<-2> : tensor<4xi32> - // CHECK: %cst = constant dense<7> : tensor - // CHECK: %cst_0 = constant dense<10> : tensor<4xi32> - // CHECK: %cst_1 = constant dense<3> : tensor<4xi32> - // CHECK: %cst_2 = constant dense<6> : tensor<4xi32> + // CHECK: %[[CST:.*]] = constant dense<7> : tensor + // CHECK: %[[CST_0:.*]] = constant dense<10> : tensor<4xi32> + // CHECK: %[[CST_1:.*]] = constant dense<3> : tensor<4xi32> + // CHECK: %[[CST_2:.*]] = constant dense<6> : tensor<4xi32> %5 = "tfl.sub"(%0, %1) {fused_activation_function = "NONE"} : (tensor< i32>, tensor< i32>) -> tensor< i32> %6 = "tfl.sub"(%0, %3) {fused_activation_function = "NONE"} : (tensor< i32>, tensor<4xi32>) -> tensor<4xi32> @@ -96,10 +96,10 @@ func @mul_float() -> (tensor, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) %2 = constant dense< 3.5> : tensor<4xf32> %3 = constant dense<-0.5> : tensor<4xf32> - // CHECK: %cst = constant dense<6.750000e+00> : tensor - // CHECK: %cst_0 = constant dense<-2.250000e+00> : tensor<4xf32> - // CHECK: %cst_1 = constant dense<5.250000e+00> : tensor<4xf32> - // CHECK: %cst_2 = constant dense<-1.750000e+00> : tensor<4xf32> + // CHECK: %[[CST:.*]] = constant dense<6.750000e+00> : tensor + // CHECK: %[[CST_0:.*]] = constant dense<-2.250000e+00> : tensor<4xf32> + // CHECK: %[[CST_1:.*]] = constant dense<5.250000e+00> : tensor<4xf32> + // CHECK: %[[CST_2:.*]] = constant dense<-1.750000e+00> : tensor<4xf32> %5 = "tfl.mul"(%0, %1) {fused_activation_function = "NONE"} : (tensor< f32>, tensor< f32>) -> tensor< f32> %6 = "tfl.mul"(%0, %3) {fused_activation_function = "NONE"} : (tensor< f32>, tensor<4xf32>) -> tensor<4xf32> @@ -170,8 +170,8 @@ func @add_dense_splat_int() -> tensor<4xi32> { return %2 : tensor<4xi32> -// CHECK: %cst = constant dense<[-5, 4, 47, 105]> : tensor<4xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-5, 4, 47, 105]> : tensor<4xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_splat_dense_int @@ -183,8 +183,8 @@ func @add_splat_dense_int() -> tensor<4xi32> { return %2 : tensor<4xi32> -// CHECK: %cst = constant dense<[-5, 4, 47, 105]> : tensor<4xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-5, 4, 47, 105]> : tensor<4xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_same_shape @@ -196,8 +196,8 @@ func @add_dense_dense_int_same_shape() -> tensor<4xi32> { return %2 : tensor<4xi32> -// CHECK: %cst = constant dense<[5, 22, -2, 98]> : tensor<4xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[5, 22, -2, 98]> : tensor<4xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_trailing_dim @@ -212,10 +212,10 @@ func @add_dense_dense_int_trailing_dim() -> (tensor<2x2xi32>, tensor<2x2x2xi32>, return %0, %1, %2 : tensor<2x2xi32>, tensor<2x2x2xi32>, tensor<2x2x2xi32> -// CHECK: %cst = constant dense<{{\[\[}}11, 22], [13, 24]]> : tensor<2x2xi32> -// CHECK: %cst_0 = constant dense<{{\[\[\[}}2, 3], [5, 6]], {{\[\[}}4, 5], [7, 8]]]> : tensor<2x2x2xi32> -// CHECK: %cst_1 = constant dense<{{\[\[\[}}11, 21], [12, 22]], {{\[\[}}13, 23], [14, 24]]]> : tensor<2x2x2xi32> -// CHECK: return %cst, %cst_0, 
%cst_1 +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}11, 22], [13, 24]]> : tensor<2x2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<{{\[\[\[}}2, 3], [5, 6]], {{\[\[}}4, 5], [7, 8]]]> : tensor<2x2x2xi32> +// CHECK: %[[CST_1:.*]] = constant dense<{{\[\[\[}}11, 21], [12, 22]], {{\[\[}}13, 23], [14, 24]]]> : tensor<2x2x2xi32> +// CHECK: return %[[CST]], %[[CST_0]], %[[CST_1]] } // CHECK-LABEL: @add_dense_dense_int_mixing_1_n @@ -226,8 +226,8 @@ func @add_dense_dense_int_mixing_1_n() -> tensor<2x2xi32> { %0 = "tfl.add"(%cst_0, %cst_1) {fused_activation_function = "NONE"} : (tensor<1x2xi32>, tensor<2x1xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> -// CHECK: %cst = constant dense<{{\[\[}}4, 5], [5, 6]]> : tensor<2x2xi32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}4, 5], [5, 6]]> : tensor<2x2xi32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_splat_float @@ -239,8 +239,8 @@ func @add_dense_splat_float() -> tensor<4xf32> { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_splat_dense_float @@ -252,8 +252,8 @@ func @add_splat_dense_float() -> tensor<4xf32> { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-6.500000e+00, 2.000000e+00, 4.550000e+01, 1.075000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_float_same_shape @@ -265,8 +265,8 @@ func @add_dense_dense_float_same_shape() -> (tensor<4xf32>) { return %2 : tensor<4xf32> -// CHECK: %cst = constant dense<[-8.89999961, 1.000000e+00, 3.800000e+01, 9.800000e+01]> : tensor<4xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[-8.89999961, 1.000000e+00, 3.800000e+01, 9.800000e+01]> : tensor<4xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_float_trailing_dim @@ -281,10 +281,10 @@ func @add_dense_dense_float_trailing_dim() -> (tensor<2x2xf32>, tensor<2x2x2xf32 return %0, %1, %2 : tensor<2x2xf32>, tensor<2x2x2xf32>, tensor<2x2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-4.500000e+00, -2.500000e+00], [8.500000e+00, -8.500000e+00]]> : tensor<2x2xf32> -// CHECK: %cst_0 = constant dense<{{\[\[\[}}-4.500000e+00, 2.500000e+00], [9.500000e+00, -2.500000e+00]], {{\[\[}}-2.500000e+00, 4.500000e+00], [1.150000e+01, -5.000000e-01]]]> : tensor<2x2x2xf32> -// CHECK: %cst_1 = constant dense<{{\[\[\[}}2.000000e+00, -3.000000e+00], [3.000000e+00, -2.000000e+00]], {{\[\[}}4.000000e+00, -1.000000e+00], [5.000000e+00, 0.000000e+00]]]> : tensor<2x2x2xf32> -// CHECK: return %cst, %cst_0, %cst_1 +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-4.500000e+00, -2.500000e+00], [8.500000e+00, -8.500000e+00]]> : tensor<2x2xf32> +// CHECK: %[[CST_0:.*]] = constant dense<{{\[\[\[}}-4.500000e+00, 2.500000e+00], [9.500000e+00, -2.500000e+00]], {{\[\[}}-2.500000e+00, 4.500000e+00], [1.150000e+01, -5.000000e-01]]]> : tensor<2x2x2xf32> +// CHECK: %[[CST_1:.*]] = constant dense<{{\[\[\[}}2.000000e+00, -3.000000e+00], [3.000000e+00, -2.000000e+00]], {{\[\[}}4.000000e+00, -1.000000e+00], [5.000000e+00, 0.000000e+00]]]> : tensor<2x2x2xf32> +// CHECK: return %[[CST]], %[[CST_0]], %[[CST_1]] } // CHECK-LABEL: @add_dense_dense_float_mixfng_1_n @@ -296,24 
+296,24 @@ func @add_dense_dense_float_mixfng_1_n() -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-1.500000e+00, -5.500000e+00], [5.500000e+00, 1.500000e+00]]> : tensor<2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-1.500000e+00, -5.500000e+00], [5.500000e+00, 1.500000e+00]]> : tensor<2x2xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @rank func @rank() -> tensor<1xi32> { %cst = constant dense<[[1], [2]]> : tensor<2x1xi32> - // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return %[[CST]] %0 = "tfl.rank"(%cst) : (tensor<2x1xi32>) -> tensor<1xi32> return %0 : tensor<1xi32> } // CHECK-LABEL: @rank_input_known_rank func @rank_input_known_rank(%arg0 : tensor<2x1xi32>) -> tensor<1xi32> { - // CHECK: [[cst:%.*]] = constant dense<2> : tensor<1xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<2> : tensor<1xi32> + // CHECK: return %[[CST]] %0 = "tfl.rank"(%arg0) : (tensor<2x1xi32>) -> tensor<1xi32> return %0 : tensor<1xi32> } @@ -323,8 +323,8 @@ func @reshape() -> tensor<4xi32> { %input = constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> %shape = constant dense<[4]> : tensor<1xi32> - // CHECK: [[cst:%.*]] = constant dense<[1, 2, 3, 4]> : tensor<4xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<[1, 2, 3, 4]> : tensor<4xi32> + // CHECK: return %[[CST]] %0 = "tfl.reshape"(%input, %shape) : (tensor<2x2xi32>, tensor<1xi32>) -> tensor<4xi32> return %0 : tensor<4xi32> } @@ -334,8 +334,8 @@ func @reshape_dynamic_output() -> tensor { %input = constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> %shape = constant dense<[4]> : tensor<1xi32> - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.reshape"(%input, %shape) : (tensor<2x2xi32>, tensor<1xi32>) -> tensor return %0 : tensor } @@ -343,8 +343,8 @@ func @reshape_dynamic_output() -> tensor { // CHECK-LABEL: @pseudo_const func @pseudo_const() -> tensor { - // CHECK: [[cst:%.*]] = constant dense<1> : tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<1> : tensor + // CHECK: return %[[CST]] %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor return %0 : tensor } @@ -356,8 +356,8 @@ func @range_int() -> tensor { %cst_1 = constant dense<4> : tensor %cst_2 = constant dense<1> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -368,8 +368,8 @@ func @range_float() -> tensor { %cst_1 = constant dense<4.0> : tensor %cst_2 = constant dense<1.0> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : 
(tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -381,8 +381,8 @@ func @range_float_neg_delta() -> tensor { %cst_1 = constant dense<-4.0> : tensor %cst_2 = constant dense<-1.0> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -393,8 +393,8 @@ func @range_float_nonzero_base() -> tensor { %cst_1 = constant dense<7.0> : tensor %cst_2 = constant dense<1.5> : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -414,8 +414,8 @@ func @transpose_1d() -> tensor<3xi32> { %cst = constant dense<[1, 2, 3]> : tensor<3xi32> %cst_perm = constant dense<0> : tensor<1xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[}}1, 2, 3]> : tensor<3xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[}}1, 2, 3]> : tensor<3xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor<3xi32> return %0 : tensor<3xi32> } @@ -425,8 +425,8 @@ func @transpose_dynamic() -> tensor { %cst = constant dense<[1, 2, 3]> : tensor<3xi32> %cst_perm = constant dense<0> : tensor<1xi32> - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor return %0 : tensor } @@ -436,8 +436,8 @@ func @transpose_2d() -> tensor<2x2xi32> { %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> %cst_perm = constant dense<[1, 0]> : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 2], {{\[}}1, 3]]> : tensor<2x2xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[}}0, 2], {{\[}}1, 3]]> : tensor<2x2xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -447,8 +447,8 @@ func @transpose_2d_identity() -> tensor<2x2xi32> { %cst = constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> %cst_perm = constant dense<[0, 1]> : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[}}0, 1], {{\[}}2, 3]]> : tensor<2x2xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[}}0, 1], {{\[}}2, 3]]> : tensor<2x2xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -460,8 +460,8 @@ func @transpose_3d() -> tensor<4x2x3xi32> { %cst = constant dense<[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]> : tensor<2x3x4xi32> %cst_perm = constant dense<[2, 
0, 1]> : tensor<3xi32> - // CHECK: [[cst:%.*]] = constant dense<{{\[\[\[}}0, 4, 8], {{\[}}12, 16, 20]], {{\[\[}}1, 5, 9], {{\[}}13, 17, 21]], {{\[\[}}2, 6, 10], {{\[}}14, 18, 22]], {{\[\[}}3, 7, 11], {{\[}}15, 19, 23]]]> : tensor<4x2x3xi32> - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 4, 8], {{\[}}12, 16, 20]], {{\[\[}}1, 5, 9], {{\[}}13, 17, 21]], {{\[\[}}2, 6, 10], {{\[}}14, 18, 22]], {{\[\[}}3, 7, 11], {{\[}}15, 19, 23]]]> : tensor<4x2x3xi32> + // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x3x4xi32>, tensor<3xi32>) -> tensor<4x2x3xi32> return %0 : tensor<4x2x3xi32> } @@ -473,8 +473,8 @@ func @ConstantFoldBinaryOpDynamicOutput() -> tensor { %87 = "tfl.sub"(%cst_0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor return %87 : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: return %[[CST]] } // CHECK-LABEL: @add_dense_dense_int_same_shape_dynamic @@ -486,8 +486,8 @@ func @add_dense_dense_int_same_shape_dynamic() -> tensor { return %2 : tensor - // CHECK: [[cst:%.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor - // CHECK: return [[cst]] + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor + // CHECK: return %[[CST]] } // CHECK-LABEL: @concat_2_tensors_1_empty @@ -497,8 +497,8 @@ func @concat_2_tensors_1_empty() -> tensor<2xi32> { %3 = "tfl.concatenation"(%1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<0xi32>) -> tensor<2xi32> return %3 : tensor<2xi32> - // CHECK: [[cst:%.*]] = constant dense<1> : tensor<2xi32> - // CHECK: return [[cst]] : tensor<2xi32> + // CHECK: %[[CST:.*]] = constant dense<1> : tensor<2xi32> + // CHECK: return %[[CST]] : tensor<2xi32> } // CHECK-LABEL: @concat_3_tensors_1_empty @@ -509,7 +509,7 @@ func @concat_3_tensors_1_empty() -> tensor { %3 = "tfl.concatenation"(%0, %1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<2xi32>, tensor<0xi32>) -> tensor return %3 : tensor - // CHECK: %0 = "tfl.concatenation"(%cst, %cst) {axis = 0 : i32, fused_activation_function = "NONE"} + // CHECK: %0 = "tfl.concatenation"(%[[CST]], %[[CST]]) {axis = 0 : i32, fused_activation_function = "NONE"} // CHECK: return %0 : tensor } @@ -520,10 +520,10 @@ func @concatConstantTensorsFirstDim() -> tensor<2x2x3xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<2x2x3xi32> return %0 : tensor<2x2x3xi32> - // CHECK: [[cst:%.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0]], {{\[}}{{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<2x2x3xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0]], {{\[}}{{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<2x2x3xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @concatConstantTensorsMiddleDim @@ -533,10 +533,10 @@ func @concatConstantTensorsMiddleDim() -> tensor<1x4x3xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<1x4x3xi32> return %0 : tensor<1x4x3xi32> - // CHECK: [[cst:%.*]] = constant 
dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0], {{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<1x4x3xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0], {{\[}}0, 0, 0], {{\[}}1, 1, 1], {{\[}}1, 1, 1]]]> : tensor<1x4x3xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @concatConstantTensorsLastDim @@ -546,10 +546,10 @@ func @concatConstantTensorsLastDim() -> tensor<1x2x6xi32> { %0 = "tfl.concatenation"(%cst_0, %cst_1) {axis = 2 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xi32>, tensor<1x2x3xi32>) -> tensor<1x2x6xi32> return %0 : tensor<1x2x6xi32> - // CHECK: [[cst:%.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0, 1, 1, 1], {{\[}}0, 0, 0, 1, 1, 1]]]> : tensor<1x2x6xi32> + // CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}0, 0, 0, 1, 1, 1], {{\[}}0, 0, 0, 1, 1, 1]]]> : tensor<1x2x6xi32> // CHECK-NOT: constant-dense // CHECK-NOT: "tfl.concatenation" - // CHECK: return [[cst]] + // CHECK: return %[[CST]] } // CHECK-LABEL: @div_dense_dense_float_mixfng_1_n @@ -561,8 +561,8 @@ func @div_dense_dense_float_mixfng_1_n() -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> -// CHECK: %cst = constant dense<{{\[\[}}-5.000000e-01, 0.833333313], [3.750000e-01, -6.250000e-01]]> : tensor<2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<{{\[\[}}-5.000000e-01, 0.833333313], [3.750000e-01, -6.250000e-01]]> : tensor<2x2xf32> +// CHECK: return %[[CST]] } // CHECK-LABEL: @div_dense_different_rank @@ -574,6 +574,6 @@ func @div_dense_different_rank() -> tensor<1x2x2xf32> { return %0 : tensor<1x2x2xf32> -// CHECK: %cst = constant dense<[{{\[}}{{\[}}5.000000e-01, 0.333333343], [1.000000e+00, 0.666666686]]]> : tensor<1x2x2xf32> -// CHECK: return %cst +// CHECK: %[[CST:.*]] = constant dense<[{{\[}}{{\[}}5.000000e-01, 0.333333343], [1.000000e+00, 0.666666686]]]> : tensor<1x2x2xf32> +// CHECK: return %[[CST]] } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt new file mode 100644 index 00000000000..31e2157d360 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt @@ -0,0 +1,1186 @@ +# RUN: tf_tfl_translate --mlir-elide-elementsattrs-if-larger=10 --emit-builtin-tflite-ops \ +# RUN: --emit-select-tf-ops --tf-inference-type=DT_QUINT8 --tf-input-min-values=0.0 \ +# RUN: --tf-input-max-values=6.283185307179586 --tf-input-arrays=quant_dense_input --tf-input-shapes=1,1 \ +# RUN: --tf-output-arrays=Identity -o %t.tflite %s 2>&1 | FileCheck %s +# RUN: flatbuffer_to_string %t.tflite | FileCheck --check-prefix=RESULT1 %s + +node { + name: "quant_dense_input" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: -1 + } + dim { + size: 1 + } + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + dim { + size: 16 + } + } + tensor_content: "n\267\313\276@W\337>\227o9>PF\237\275|%}\276\333n\005>\371\005\031?\230\355\235\275\344\211_>\034\222\264=\254\003\345=Q\027*>\225\304\373>Qm6>\270\025\022;N/\020\277" + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/MatMul/ReadVariableOp/resource" + attr { + 
key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.5634322166442871 + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5968852043151855 + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/MatMul/ReadVariableOp" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10455" + } + } +} +node { + name: "sequential/quant_dense/MatMul" + op: "MatMul" + input: "quant_dense_input" + input: "sequential/quant_dense/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + } + tensor_content: "\000\000\000\000L\020\341=\355\223\242\276\000\000\000\000\000\000\000\000\223&<>\206\234d\276\000\000\000\000\323%\241\276\331\305\234>\004\341\216>\3545e\276\032\363O\276\257:u\276\313\223\r>\000\000\000\000" + } + } + } +} +node { + name: "sequential/quant_dense/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense/MatMul" + input: "sequential/quant_dense/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "sequential/quant_dense/Relu" + op: "Relu" + input: "sequential/quant_dense/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: 
"sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.249699354171753 + } + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/Relu" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/Relu" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10477" + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.1626083850860596 + } + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/oquant/IdentityN" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/iquant/IdentityN" + op: "IdentityN" + input: 
"sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/oquant/IdentityN" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10494" + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + dim { + size: 16 + } + } + tensor_content: "\356x\007<\273\351H\275\2506\242\275\261t\332>\"O\'>m\"\010\276(\311b\275\301\233\262=d\327\031\275V\030`>1U\212\276\325\353,=\321\375+=\305\016-=IQC\276\272N\255\276\342\\\241=\253K\230>\221\247\220\274\026[\220=\301W9\276\3041\311>\200\013\255\276@\331\307>\250\246\320=e\305A\276\312\211\200>\252\220x\276?\306\305>jC\221>_h\304>\350\2701>\213&\014\277\327a\251>\010\321\365=\022\346\357>_P\326>\024@\260\276\010\002\203\275%\013;\276\300\270s\2762~\206\2769\341\356<;\364\023\276\'=\225\276\347\327\247\275\037\317\326\276\2379\243>\241+\306\276\252\226\007\276\232\273\010= \001s>h.\010=x\rK=\231\255\315\276\001\244>>_\227\304\276\030\255\255\276\t\013\014>\263\031i\276\312\312\204>\262\331\244=\367\331R\276\020h\005>\361\260\322a?\323>\t\236\304\275\240\215\240\275Y\034V>\200\236=<\232\336Y\276\274_\263\276\240\004\334\276\232\003\252>\264\243\273>\254Q\201>_\271\263\276a1\305>\237?\331>\244\031\275>\365K\351\276\212\302\234>\276{\217\275{\211{\276.\016\271\276\337\344\255\276\230\2347\2757O(>puO=\024\177\207\276v:\255>%.\230>\237\211\341=\327\316\037\274\302\205I>\020\234\201\276\027\356(\276J\320*\277\014\226\240=\222\312\323\276J\235\253=\263\201\221>\030\220\027\276\375g\"\277\227E\326\276\206\267{>\253\271\374\276!\235\203>\264\215\342=g\235\267\2767\211\306>x\323i>\244TJ>\306\216]\275\310\230\004\276\340\000B\274\351\324a>$[\266=\267\220\256\275\241n >\003s\235>\340\251\225\276\335jX=N\"@\276N;\271\276Sl6=\234S\007>~\345\025\277\013\020\362\275\377\333 \276@\227\325=\177\375\215=\335\357\332=t\217\027\276D\337\326=}\211\363\276\227\"\322>r\245\267=]\007x=4\372e>\252\235\373\276\030\276\221\275\2763\000\276\202\007\301>H\037Y>\307;\006?QT\241=NY\\<\014\254@>P\031\217\010\327+\276\212\240\305\276\371x7>$\350\037?I\332\262\276\361\307\031\276\275o\206>\355\217\016\276 \036\007>yQ\221=\372\204\207>#\301\214>\344\376\302\275y\024\261\2768\304+\276\260\n \276(\026\254>\300*\034=j\2142=[\023\233>\301\370\304\2766<0=\321\025\237>\264C\220\2765\301\330\276\274\027\252\276\n\331U\276\355\363\302>\214i\305\276\367`\273>\244\352\241\276c\030C\276u\033\251\275r\314\036>\033\352h\275\361\372\374\276\370\356\242\274O\305\030\277\360\264\221>\244 \'\276\223j\006>\334\320E>\274\223\031\277\275\210\326>\330\301o=9\273>>\324\003\314\275\234\204\215\276\321\253|\276\313v\275\276d\256\270>r,\013\27765J=\313s\'\276\035\363{\276\"3\251\275\007\267d<\266\261\256\2766\205g\276~\275\331>\272\317\242>\237\305\034>\312\1770\2769\356\024\277\255|Y\276\374V\271=\320b\177\276\014\023\345\2766\353\226\273\333\033\373\275.[\264\276\tx@\276Dx\273>Q\327\233>\246\265\353>\021\214\315\276\000K\356\272d\207\037\276\331Z\321\276\\>\001?\202\343\031?\000>^\2727\375V\276\266\006[=\367\377\313>;vB\275F\212\022?D\253\246\276\314\207\210>\255\222\211\276\230V\223\273z,\316\276\325D\206\276CR\260> \260-\275\273-*\276i\032d>\266\316\003>\300\322\205\276\232\205\322\276\036\267\205>\372Y\342=r\221k\276(\003k>" + } + } + } +} +node { + name: 
"sequential/quant_dense_1/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.6485296487808228 + } + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.6135242581367493 + } + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10513" + } + } +} +node { + name: "sequential/quant_dense_1/MatMul" + op: "MatMul" + input: "sequential/quant_dense_1/iquant/IdentityN" + input: "sequential/quant_dense_1/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + } + tensor_content: "Y\226;>\002#==\315\253\207>\032t\021\276\2510O\273\354\026\001\276\000\000\000\000\227\021M>\275D\213>\000\000\000\000\231\354)\276]@\237>\026\327E=p\007\003>\007\340c>\\\241\336\275" + } + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense_1/MatMul" + input: "sequential/quant_dense_1/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: 
"sequential/quant_dense_1/Relu" + op: "Relu" + input: "sequential/quant_dense_1/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 2.7181646823883057 + } + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/Relu" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/Relu" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10535" + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + dim { + size: 1 + } + } + tensor_content: "\217\017J?$\023*>\352\265\272\276\304|;\276\2178\277=\275\031\026\277J\017\304\276Iu!?h]\314\276A7;\276\204\221\031\275\323\2259\277\341K\304>Uzs?@)\337>\"\300n\276" + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.7017847895622253 + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: 
DT_FLOAT + tensor_shape { + } + float_val: 0.9041997790336609 + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10554" + } + } +} +node { + name: "sequential/quant_dense_2/MatMul" + op: "MatMul" + input: "sequential/quant_dense_1/oquant/IdentityN" + input: "sequential/quant_dense_2/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + } + float_val: 0.04556306451559067 + } + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense_2/MatMul" + input: "sequential/quant_dense_2/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.8735198974609375 + } + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1.0447778701782227 + } + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars" + 
op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/BiasAdd" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_2/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/BiasAdd" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10575" + } + } +} +node { + name: "sequential/output/FakeQuantWithMinMaxArgs" + op: "FakeQuantWithMinMaxArgs" + input: "sequential/quant_dense_2/oquant/IdentityN" + attr { + key: "max" + value { + f: 0.9921875 + } + } + attr { + key: "min" + value { + f: -1.0 + } + } + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "Identity" + op: "Identity" + input: "sequential/output/FakeQuantWithMinMaxArgs" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +versions { + producer: 175 +} + +# CHECK: {{.*: warning:}} loc(fused["sequential/quant_dense/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer +# CHECK: {{.*: warning:}} loc(fused["sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer + +# RESULT1: name: "Identity" +# RESULT1-NEXT: quantization: +# RESULT1-NEXT: scale: [ 0.007523 ], +# RESULT1-NEXT: zero_point: [ 116 ] + +# TODO Actually RESULT1 represents in incomplete implementation +# Currently TF2.2.0-rc3 all but the first fake_quant in +# sequence are dropped. A correct transalation would retain the second as a requantization +# op. 
+ +# CORRECT1:name: "Identity" +# CORRECT1-NEXT: quantization: +# CORRECT1-NEXT: scale: [ 0.007813 ], +# CORRECT1-NEXT: zero_point: [ 128 ] + diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt new file mode 100644 index 00000000000..096033e37cb --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt @@ -0,0 +1,101 @@ +# RUN: tf_tfl_translate -tf-input-arrays=Placeholder,Placeholder_1 -tf-input-shapes=2,5,3:3,7 -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-output-arrays=MatMul -output-mlir %s -o - 2>&1 | FileCheck %s + +node { + name: "Placeholder" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 2 + } + dim { + size: 5 + } + dim { + size: 3 + } + } + } + } +} +node { + name: "Placeholder_1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 3 + } + dim { + size: 7 + } + } + } + } +} +node { + name: "MatMul" + op: "BatchMatMulV2" + input: "Placeholder" + input: "Placeholder_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "adj_x" + value { + b: false + } + } + attr { + key: "adj_y" + value { + b: false + } + } +} +versions { + producer: 175 +} + +# CHECK: func @main(%[[VAL_0:.*]]: tensor<2x5x3xf32>, %[[VAL_1:.*]]: tensor<3x7xf32>) -> tensor<2x5x7xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "Placeholder,Placeholder_1", outputs = "MatMul"}} { +# CHECK: %[[VAL_2:.*]] = constant dense<[1, 0]> : tensor<2xi32> +# CHECK: %[[VAL_3:.*]] = constant dense<[5, 3]> : tensor<2xi32> +# CHECK: %[[VAL_4:.*]] = constant dense<[3, 7]> : tensor<2xi32> +# CHECK: %[[VAL_5:.*]] = constant unit +# CHECK: %[[VAL_6:.*]] = constant dense<[1, 0, 0]> : tensor<3xi32> +# CHECK: %[[VAL_7:.*]] = constant dense<[1, 5, 3]> : tensor<3xi32> +# CHECK: %[[VAL_8:.*]] = constant dense<0> : tensor<3xi32> +# CHECK: %[[VAL_9:.*]] = constant dense<[1, 3, 7]> : tensor<3xi32> +# CHECK: %[[VAL_10:.*]] = "tfl.slice"(%[[VAL_0]], %[[VAL_8]], %[[VAL_7]]) : (tensor<2x5x3xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x5x3xf32> +# CHECK: %[[VAL_11:.*]] = "tfl.reshape"(%[[VAL_10]], %[[VAL_3]]) : (tensor<1x5x3xf32>, tensor<2xi32>) -> tensor<5x3xf32> +# CHECK: %[[VAL_12:.*]] = "tfl.slice"(%[[VAL_0]], %[[VAL_6]], %[[VAL_7]]) : (tensor<2x5x3xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x5x3xf32> +# CHECK: %[[VAL_13:.*]] = "tfl.reshape"(%[[VAL_12]], %[[VAL_3]]) : (tensor<1x5x3xf32>, tensor<2xi32>) -> tensor<5x3xf32> +# CHECK: %[[VAL_14:.*]] = "tfl.reshape"(%[[VAL_1]], %[[VAL_9]]) : (tensor<3x7xf32>, tensor<3xi32>) -> tensor<1x3x7xf32> +# CHECK: %[[VAL_15:.*]] = "tfl.slice"(%[[VAL_14]], %[[VAL_8]], %[[VAL_9]]) : (tensor<1x3x7xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x3x7xf32> +# CHECK: %[[VAL_16:.*]] = "tfl.reshape"(%[[VAL_15]], %[[VAL_4]]) : (tensor<1x3x7xf32>, tensor<2xi32>) -> tensor<3x7xf32> +# CHECK: %[[VAL_17:.*]] = "tfl.transpose"(%[[VAL_16]], %[[VAL_2]]) : (tensor<3x7xf32>, tensor<2xi32>) -> tensor<7x3xf32> +# CHECK: %[[VAL_18:.*]] = "tfl.fully_connected"(%[[VAL_11]], %[[VAL_17]], %[[VAL_5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> +# CHECK: %[[VAL_19:.*]] = "tfl.fully_connected"(%[[VAL_13]], %[[VAL_17]], %[[VAL_5]]) {fused_activation_function = "NONE", keep_num_dims = 
false, weights_format = "DEFAULT"} : (tensor<5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> +# CHECK: %[[VAL_20:.*]] = "tfl.pack"(%[[VAL_18]], %[[VAL_19]]) {axis = 0 : i32, values_count = 2 : i32} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<2x5x7xf32> +# CHECK: return %[[VAL_20]] : tensor<2x5x7xf32> +# CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir index 0dd8ddc4c91..d793ea2d62f 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -1,15 +1,15 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s // Ensure lstm roundtrip exactly -func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { +func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, 
tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> // CHECK-LABEL: main // seperate lines since there is no region for this op. third_party/tensorflow/compiler/mlir/lite/ir/tfl_ops.td: 3252 // CHECK: %[[RES0:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg22, %arg23, %arg18, %arg19, %arg20, %arg21) ( { -// CHECK: }) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK: }) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES0]] } diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir new file mode 100644 index 00000000000..f08ac0e1027 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -0,0 +1,14 @@ +// RUN: tf-opt -tfl-prepare-composite-funcs-tf -tfl-fuse-tftext=true %s -split-input-file | FileCheck %s --dump-input-on-failure +module { + + func @_whitespace_func(%arg0: tensor<1x!tf.string>) -> (tensor, tensor) attributes {tf._GrapplerSpecializedFunc = true, tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { + %0 = "tf.op1"(%arg0) : (tensor<1x!tf.string>) -> (tensor) + %1 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %2:2 = "tf.op2"(%arg0, %1) : (tensor<1x!tf.string>, tensor) -> (tensor, tensor) + return %2#0, %2#1 : tensor, tensor + } + + // CHECK: func @_whitespace_func(%arg0: tensor<1x!tf.string>) -> (tensor, tensor) attributes {tf._GrapplerSpecializedFunc = true, tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { + // CHECK: "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor<1x!tf.string>) -> (tensor, tensor) + // CHECK: return %0#0, %0#1 : tensor, tensor +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 15b6bf56b7a..15c73d2db2c 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1048,6 +1048,15 @@ func @concatv2With3Tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2 // CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> } 
+func @concatv2I64Axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { + %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>, tensor) -> tensor<2x3xi32> + return %1 : tensor<2x3xi32> + +// CHECK-LABEL: concatv2I64Axis +// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> +} + func @resize_with_bilinear(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { %0 = "tf.ResizeBilinear"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 9b1eeab3d7c..a7fb5b1666e 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -292,7 +292,7 @@ func @tensorlistResize(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: ten // CHECK: [[SIZE_DIFF:%.*]] = "tf.Sub"([[SIZE]], [[INPUT_SIZE]]) : (tensor, tensor) -> tensor // CHECK: [[DIFF_RES:%.*]] = "tf.Greater"([[SIZE_DIFF]], [[ZERO]]) : (tensor, tensor) -> tensor // CHECK: [[SHAPE_1:%.*]] = "tf.Shape"([[INPUT]]) : (tensor<3x10xf32>) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, output_shapes = [], then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir index 1b46fa3d0e5..320f869ac4c 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir @@ -65,7 +65,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 2, 1 ], // CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: custom_options: [ 105, 110, 116, 95, 97, 116, 116, 114, 0, 102, 117, 115, 101, 100, 95, 97, 99, 116, 105, 118, 97, 116, 105, 111, 110, 95, 102, 117, 110, 99, 116, 105, 111, 110, 0, 4, 82, 69, 76, 85, 0, 2, 33, 43, 2, 1, 2, 11, 2, 20, 4, 4, 36, 1 ] +// CHECK-NEXT: custom_options: [ 102, 117, 115, 101, 100, 95, 97, 99, 116, 105, 118, 97, 116, 105, 111, 110, 95, 102, 117, 110, 99, 116, 105, 111, 110, 0, 4, 82, 69, 76, 85, 0, 105, 110, 116, 95, 97, 116, 116, 114, 0, 2, 42, 11, 2, 1, 2, 20, 2, 20, 4, 4, 36, 1 ] // CHECK-NEXT: }, { // CHECK-NEXT: opcode_index: 2, // CHECK-NEXT: inputs: [ 3 ], diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index e278572cd1e..ef78f993cc4 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -1,6 +1,6 @@ // RUN: flatbuffer_translate 
-mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> { +func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { @@ -72,21 +72,21 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 10, // CHECK-NEXT: name: "arg9", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 11, // CHECK-NEXT: name: "arg10", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 12, // CHECK-NEXT: name: "arg11", // CHECK-NEXT: quantization: { @@ -100,21 +100,21 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 14, // CHECK-NEXT: name: "arg13", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 15, // CHECK-NEXT: name: "arg14", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 16, // CHECK-NEXT: name: "arg15", // CHECK-NEXT: quantization: { @@ -128,7 +128,7 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 18, // CHECK-NEXT: name: "arg17", // CHECK-NEXT: quantization: { @@ -261,9 +261,9 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: -^bb0(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>): +^bb0(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: 
tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>): %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir index 8e579421b0b..d9bba58b7d7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -1,6 +1,6 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +func @main(tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { @@ -9,63 +9,63 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, 
tensor<4 x f32>, t // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 1, // CHECK-NEXT: name: "arg0", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 2, // CHECK-NEXT: name: "arg1", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 3, // CHECK-NEXT: name: "arg2", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 4, // CHECK-NEXT: name: "arg3", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 5, // CHECK-NEXT: name: "arg4", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 6, // CHECK-NEXT: name: "arg5", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 7, // CHECK-NEXT: name: "arg6", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 8, // CHECK-NEXT: name: "arg7", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 9, // CHECK-NEXT: name: "arg8", // CHECK-NEXT: quantization: { @@ -121,63 +121,63 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 17, // CHECK-NEXT: name: "arg16", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 18, // CHECK-NEXT: name: "arg17", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 19, // CHECK-NEXT: name: "arg18", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 20, // CHECK-NEXT: name: "arg19", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 21, // CHECK-NEXT: name: "arg20", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 22, // CHECK-NEXT: name: "arg21", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: }, // CHECK-NEXT: is_variable: true // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const1", // 
CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: }, // CHECK-NEXT: is_variable: true // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 25, // CHECK-NEXT: name: "tfl.unidirectional_sequence_lstm", // CHECK-NEXT: quantization: { @@ -244,9 +244,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -259,9 +259,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: } // CHECK-EMPTY: -^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>): - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %2 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %0, %1, %arg18, %arg19, %arg20, %arg21) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %2 : tensor<4xf32> +^bb0(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4x4xf32>, %arg18: tensor<4x4xf32>, %arg19: tensor<4x4xf32>, %arg20: tensor<4x4xf32>, %arg21: tensor<4x4xf32>): + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %2 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, 
%arg13, %arg14, %arg15, %arg16, %arg17, %0, %1, %arg18, %arg19, %arg20, %arg21) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %2 : tensor<4x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir index 7ba24bd5c51..f2b99bcd0df 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir @@ -37,7 +37,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const", // CHECK-NEXT: quantization: { // CHECK-EMPTY: @@ -76,7 +76,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -90,7 +90,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-EMPTY: ^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>): - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %0) {fused_activation_function = "TANH", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %0) {fused_activation_function = "TANH", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>) -> tensor<4xf32> return %1 : tensor<4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index b1d1d81af37..3451f28380b 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -190,9 +190,9 @@ func @testSquare(tensor) -> tensor { return %0 : tensor } -func @testQuantizedResizeNearestNeighbor(tensor>, tensor) -> tensor> { -^bb0(%arg0: tensor>, %arg1: tensor): - %0 = "tfl.resize_nearest_neighbor"(%arg0, %arg1) { align_corners = false, half_pixel_centers = false } : (tensor>, tensor) -> tensor> +func @testQuantizedResizeNearestNeighbor(tensor>, tensor) -> tensor> { +^bb0(%arg0: tensor>, %arg1: tensor): + %0 = "tfl.resize_nearest_neighbor"(%arg0, %arg1) { align_corners = false, half_pixel_centers = false } : (tensor>, tensor) -> tensor> return %0 : tensor> } @@ -573,7 +573,7 @@ func 
@testLogistic(tensor<1x2x3x4x5xf32>) -> tensor<1x2x3x4x5xf32> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or QUI16 type values}} + // expected-error @+1 {{'tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or TFLite quint8 type values, but got 'tensor'}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } @@ -581,36 +581,36 @@ func @testLogisticWithWrongInputType(tensor) -> tensor { // ----- // CHECK-LABEL: testUnidirectionalSequenceRnn -func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstmWithoutProjection -func @testUnidirectionalSequenceLstmWithoutProjection(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: none, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstmWithoutProjection(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: none, %arg18: tensor, %arg19: 
tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstm -func @testUnidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, 
%arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr -func @testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -663,10 +663,10 @@ func @testLstmQuantizedType(%arg0: 
tensor<1x528x!quant.uniform, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { +func @testLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -689,10 +689,10 @@ func @testQuantizedBasicLstm(%arg0: tensor<1x384x!quant.uniform, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { +func @testLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, 
%arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -707,11 +707,11 @@ func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor // ----- -// test invalid input dimension, the first input operand for lstm op should be at least 2D tensor. +// test invalid input dimension, the third input operand for lstm op should be 2-D tensor. 
func @testLstmWithInvalidInputDimension(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>) -> tensor<4 x f32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - // expected-error @+1 {{'tfl.lstm' op the first input operand should have more than 2 dimensions.}} + // expected-error @+1 {{'tfl.lstm' op failed to verify that operand 2 is 2-D}} %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %24 : tensor<4xf32> @@ -720,22 +720,22 @@ func @testLstmWithInvalidInputDimension(%arg0: tensor<4 x f32>, %arg1: tensor<4 // ----- // 'input_to_output_weights' input for lstm op has unmatched rank with `input`. 
-func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x2xf32>, %arg2: tensor<4x2xf32>, %arg3: tensor<4x2xf32>, %arg4: tensor<4x2xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { +func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x2xf32>, %arg2: tensor<4x2xf32>, %arg3: tensor<4x2xf32>, %arg4: tensor<4x2xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") // expected-error @+1 {{'tfl.lstm' op inputs don't match with the dimensions.}} - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } // ----- // Coefficient inputs of LSTM op don't match the dimension with input operand `input_to_output_weights`. 
-func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<3xf32>, %arg19: tensor<3xf32>, %arg20: tensor<3xf32>, %arg21: tensor<3xf32>) -> tensor<1x4xf32> { +func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<3xf32>, %arg19: tensor<3xf32>, %arg20: tensor<3xf32>, %arg21: tensor<3xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") // expected-error @+1 {{'tfl.lstm' op coefficient inputs have more than 2 dimensions or don't match the dimension with input operand `input_to_output_weights`.}} - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } @@ -1201,7 +1201,7 @@ func @testResizeBilinear(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) // ----- func @testResizeBilinearInvalidOutputType(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) -> tensor { - // expected-error @+1 {{'tfl.resize_bilinear' op result #0 must be tensor of 32-bit float or QI8 type or QUI8 type values}} + // expected-error @+1 {{'tfl.resize_bilinear' op failed to verify that input and output must have same element type}} %0 = 
"tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor } @@ -1252,10 +1252,10 @@ func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, % // ----- -func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi8> { - // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer values}} - %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> - return %0 : tensor<*xi8> +func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi16> { + // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer or 8-bit signless integer or 8-bit unsigned integer values, but got 'tensor<*xi16>'}} + %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi16> + return %0 : tensor<*xi16> } // ----- @@ -1489,7 +1489,8 @@ func @testEmbeddingLookupValueAndResultElementTypeTraitFailed(%arg0 : tensor>) -> tensor<1x56x56x192x!quant.uniform> { +func @testWrongQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> { + // expected-error @+1 {{'tfl.local_response_normalization' op operand #0 must be tensor of 32-bit float values, but got 'tensor<1x56x56x192x!quant.uniform>'}} %0 = "tfl.local_response_normalization"(%arg0) {alpha = 9.99999974E-5 : f32, beta = 5.000000e-01 : f32, bias = 2.000000e+00 : f32, radius = 5 : i32} : (tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> return %0 : tensor<1x56x56x192x!quant.uniform> } @@ -1498,8 +1499,8 @@ func @testQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant. 
// CHECK-LABEL: testSvdf func @testSvdf(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { - // CHECK: "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "RELU", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "RELU", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -1523,32 +1524,32 @@ func @testDepthToSpaceInvalidOutputType(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x // ----- -func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: tensor<1x1x10xf32>) -> tensor<10x10x10xf32> { - // expected-error @+1 {{'input' and 'output' should have the same rank}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<1x1x10xf32>) -> tensor<10x10x10xf32> - return %0 : tensor<10x10x10xf32> +func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: tensor<10x10x10x10xf32>) -> tensor<10x10xf32> { + // expected-error @+1 {{'tfl.prelu' op result type '10x10' not broadcast compatible with broadcasted operands's shapes '10x10x10x10'}} + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> + return %0 : tensor<10x10xf32> } // ----- func @testPReluWrongOutputShape(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> { - // expected-error @+1 {{'input' and 'output' should have the same shape}} + // expected-error @+1 {{'tfl.prelu' op result type '1x2x3x5' not broadcast compatible with broadcasted operands's shapes '1x2x3x4'}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<1x2x3x4xf32>, tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> return %0 : tensor<1x2x3x5xf32> } // ----- -func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { +func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { // expected-error @+1 {{'alpha' should have one less rank than 'input'.}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> return %0 : tensor<7x3x2x14xf32> } // ----- func @testPReluInvalidBroadcast(%arg0: tensor<15x14x2x14xf32>, %arg1: tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> { - // expected-error @+1 {{'alpha' is not broadcastable at dimension 2.}} + // expected-error @+1 {{'tfl.prelu' op operands don't have broadcast-compatible shapes}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<15x14x2x14xf32>, tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> return %0 : tensor<15x14x2x14xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 2815afd14b9..3f8257b54f0 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -439,6 +439,19 @@ func @NotReorderReshapeAddIfNotTailingDim(%arg0: tensor<40x40x1xf32>) -> tensor< // CHECK: return %[[rs2]] } +// CHECK-LABEL: @NotReorderReshapeAddIfHighDim +func 
@NotReorderReshapeAddIfHighDim(%arg0: tensor<1x1x1x1x30x96xf32>) -> tensor<1x30x96xf32> { + %cst = constant dense<2.0> : tensor + %shape = constant dense<[1, 30, 96]> : tensor<3xi32> + %1 = "tfl.reshape"(%arg0, %shape) : (tensor<1x1x1x1x30x96xf32>, tensor<3xi32>) -> tensor<1x30x96xf32> + %2 = "tfl.add"(%1, %cst) {fused_activation_function = "NONE"} : (tensor<1x30x96xf32>, tensor) -> tensor<1x30x96xf32> + return %2 : tensor<1x30x96xf32> + + // CHECK: %[[rs1:.*]] = "tfl.reshape"(%arg0 + // CHECK: %[[rs2:.*]] = "tfl.add"(%[[rs1]] + // CHECK: return %[[rs2]] +} + // CHECK-LABEL: @ReorderElementwiseValueOpAndMoveOp func @ReorderElementwiseValueOpAndMoveOp(%arg0: tensor<40x40x1xf32>) -> tensor<40x40xf32> { %shape = constant dense<[40, 40]> : tensor<2xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir index 5377c4fdb98..6573a2f1c36 100644 --- a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir @@ -19,6 +19,16 @@ func @RemoveUnused(%arg0: tensor<4xf32>, %arg1: tensor) -> (tensor<2xf32>,t // CHECK-NEXT: return %[[split]]#0, %[[split]]#1 } +// CHECK-LABEL: RemoveTrival +func @RemoveTrival(%arg0: tensor<384x512x!quant.uniform>, %arg1: tensor<128x512x!quant.uniform:f32, 1.0>>, %arg2: none) -> tensor<384x128x!quant.uniform> { + %1 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.0>>, none) -> tensor<384x128x!quant.uniform> + %2 = "tfl.quantize"(%1) {qtype = tensor<384x128x!quant.uniform>} : (tensor<384x128x!quant.uniform>) -> tensor<384x128x!quant.uniform> + return %2 : tensor<384x128x!quant.uniform> + +// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"{{.*}} -> tensor<384x128x!quant.uniform> +// CHECK-NEXT: return %[[fc]] +} + func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { %cst = constant dense<[1, 1001]> : tensor<2xi32> %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir index f6054f3d65d..e1f496b91f4 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir @@ -63,7 +63,7 @@ func @prepareAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { return %add : tensor<2x2xf32> // CHECK: %[[cst:.*]] = constant dense<[{{\[}}0.000000e+00, 1.000000e+00], [2.000000e+00, 2.550000e+02]]> -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<2x2x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[add:.*]] = tfl.add %arg0, %[[dq]] // CHECK: return %[[add]] @@ -83,7 +83,7 @@ func @prepareConv2DSplat(%arg0: tensor<1x5x5x3xf32>) -> tensor<1x5x5x3xf32> { // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] // PerTensor: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<3x3x3x3xf32> -// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>, volatile} // PerTensor: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) 
// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] } @@ -97,7 +97,7 @@ func @prepareConv2D(%arg0: tensor<1x5x5x1xf32>) -> tensor<1x5x5x3xf32> { // CHECK: %[[cst:.*]] = constant dense<[{{\[\[\[}}0.000000e+00]]], [{{\[\[}}1.270000e+02]]], [{{\[\[}}-1.270000e+02]]]]> // CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x1x1x1x!quant.uniform:f32:0, -// CHECK-SAME: {0.0078740157480314959,1.000000e+00,1.000000e+00}>>} +// CHECK-SAME: {0.0078740157480314959,1.000000e+00,1.000000e+00}>>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] @@ -134,12 +134,12 @@ func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 return %fc : tensor<1x112x112x32xf32> // CHECK: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<32x12xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>} : (tensor<32x12xf32>) -> tensor<32x12x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> // CHECK: "tfl.fully_connected"(%arg0, %[[dq]] // PerTensor: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<32x12xf32> -// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>} : (tensor<32x12xf32>) -> tensor<32x12x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} // PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> // PerTensor: "tfl.fully_connected"(%arg0, %[[dq]] } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index f937d0afd4d..38f76bb4eb5 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -67,7 +67,7 @@ func @QuantizeConv2DPerChannel(%arg0: tensor<1x224x224x3x!quant.uniform // CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> -// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>} +// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]]) // CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1) @@ -87,7 +87,7 @@ func @QuantizeConv2DPerChannels(%arg0: tensor<1x224x224x3x!quant.uniform // CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> -// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>} +// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]]) // CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1) @@ -107,7 +107,7 @@ func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform return %6 : tensor<1x112x112x32x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : 
(tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -129,7 +129,7 @@ func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -151,7 +151,7 @@ func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -232,7 +232,7 @@ func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, tensor< // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) // CHECK: return %3 : tensor<1x2x2x5xf32> } @@ -277,7 +277,7 @@ func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform // CHECK: %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) // CHECK: %1 = "tfl.reshape"(%0, %{{.*}}) : (tensor<1x6x6x16xf32>, tensor<3xi32>) -> tensor<1x36x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x36x16x!quant.uniform>) // CHECK: return %3 : tensor<1x36x16xf32> } @@ -291,7 +291,7 @@ func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) // CHECK: return %3 : tensor<1x6x6x16xf32> } @@ -305,7 +305,7 @@ func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform> // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> // CHECK: return %3 : tensor<1x6x6x16xf32> } @@ -327,7 +327,7 @@ func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) -> ten // CHECK: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK: %[[l2:.*]] = "tfl.l2_normalization"(%[[in]]) -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: return %[[dq]] : tensor<1x6x6x16xf32> } @@ -350,13 +350,13 @@ func @QuantizeConcatOperand0ToAll(tensor<1x2x!quant.uniform>, t %1 = "tfl.concatenation"(%0, %arg1) {axis = 0 : i32, 
fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> return %1 : tensor<2x2xf32> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %5 = "tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> -// CHeCK: return %5 : tensor<2x2xf32> +// CHECK: return %5 : tensor<2x2xf32> } // CHECK-LABEL: QuantizeConcatOperand1ToAll @@ -366,11 +366,11 @@ func @QuantizeConcatOperand1ToAll(tensor<1x2xf32>, tensor<1x2x!quant.uniform, tensor<1x2xf32>) -> tensor<2x2xf32> return %1 : tensor<2x2xf32> -// CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg1) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %5 = "tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> // CHECK: return %5 : tensor<2x2xf32> } @@ -382,9 +382,9 @@ func @QuantizeConcatResToAll(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!qu %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %1 : tensor<2x2x!quant.uniform> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK: %2 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %2 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %4 = "tfl.concatenation"(%3, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> // CHECK: %5 = "tfl.quantize"(%4) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> @@ -399,7 +399,7 @@ func @QuantizeConcatResToAllNoRequantize(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %2 : tensor<2x2x!quant.uniform> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = 
"tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> @@ -416,7 +416,7 @@ func @QuantizeConcatResToAllRequantize(tensor<1x2xf32>, tensor<1x2xf32>) -> tens %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> // CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> @@ -434,7 +434,7 @@ func @QuantizeConcatResToAllRequantizeArg(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> // CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> @@ -475,22 +475,22 @@ func @QuantizeChain(tensor<1x224x224x3x!quant.uniform> return %10 : tensor<1x36x16xf32> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) // CHECK: %3 = "tfl.pseudo_qconst"() // CHECK: %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) // CHECK: %5 = "tfl.average_pool_2d"(%2) -// CHECK: %6 = "tfl.quantize"(%5) {qtype = tensor<1x224x224x3x!quant.uniform>} +// CHECK: %6 = "tfl.quantize"(%5) {qtype = tensor<1x224x224x3x!quant.uniform>, volatile} // CHECK: %7 = "tfl.dequantize"(%6) : (tensor<1x224x224x3x!quant.uniform>) // CHECK: %8 = "tfl.conv_2d"(%7, %4, %1) // CHECK: %9 = "tfl.quantize"(%8) {qtype = tensor<1x112x112x32x!quant.uniform>} // CHECK: %10 = "tfl.dequantize"(%9) : (tensor<1x112x112x32x!quant.uniform>) // CHECK: %11 = "tfl.reshape"(%10, %{{.*}}) -// CHECK: %12 = "tfl.quantize"(%11) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %12 = "tfl.quantize"(%11) {qtype = tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %13 = "tfl.dequantize"(%12) : (tensor<1x36x16x!quant.uniform>) // CHECK: %14 = "tfl.softmax"(%13) -// CHECK: %15 = "tfl.quantize"(%14) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %15 = "tfl.quantize"(%14) {qtype = 
tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %16 = "tfl.dequantize"(%15) : (tensor<1x36x16x!quant.uniform>) // CHECK: return %16 : tensor<1x36x16xf32> } @@ -501,7 +501,7 @@ func @QuantizeConstant() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK: %cst = constant dense{{.*}}tensor<2x3xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) // CHECK: return %1 : tensor<2x3xf32> } @@ -521,7 +521,7 @@ func @QuantizeZeroSplat() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<0.000000e+00> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizeZeroScalar @@ -530,7 +530,7 @@ func @QuantizeZeroScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<0.000000e+00> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizePositiveSplat @@ -539,7 +539,7 @@ func @QuantizePositiveSplat() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<2.540000e+01> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizePositiveScalar @@ -548,7 +548,7 @@ func @QuantizePositiveScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<2.540000e+00> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizeNegativeSplat @@ -557,7 +557,7 @@ func @QuantizeNegativeSplat() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<-2.540000e+00> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizeNegativeScalar @@ -566,7 +566,7 @@ func @QuantizeNegativeScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<-2.540000e+01> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizeSharedBiases @@ -617,7 +617,7 @@ func @QuantizeSharedBiases2( // CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[cst_0:.*]] = constant dense<0.000000e+00> : tensor<32xf32> -// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform>} +// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]]) // CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]] // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index 54ca7f043f4..6f42ae6293d 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -195,8 +195,8 @@ func @QuantizeConcat(tensor<1x2xf32>, tensor<1x2xf32>) -> 
tensor<2x2x!quant.unif %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q1]], %[[q0]]) {axis = 0 : i32, fused_activation_function = "NONE"} // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } @@ -209,8 +209,8 @@ func @QuantizeConcatRequantize(tensor<1x2x!quant.uniform>, tens %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} // CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q0]], %[[q1]]) {axis = 0 : i32, fused_activation_function = "NONE"} // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } diff --git a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir index d2d0e43e0e9..c5c9ee645f4 100644 --- a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir +++ b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir @@ -1,27 +1,27 @@ // RUN: tf-opt -tfl-split-merged-operands %s | FileCheck %s -func @testSingleLstm(%arg0: tensor<4 x f32>) -> tensor<4xf32> { +func @testSingleLstm(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { // CHECK-LABEL: testSingleLstm - // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : 
(tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %1 : tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> } -func @testMultipleLstms(%arg0: tensor<4 x f32>) -> tensor<4xf32> { +func @testMultipleLstms(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { // CHECK-LABEL: testMultipleLstms - // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - // CHECK: %[[CST_2:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_3:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, 
%[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + // CHECK: %[[CST_2:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_3:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - %2 = "tfl.unidirectional_sequence_lstm"(%1, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, 
tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %2 : tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + %2 = "tfl.unidirectional_sequence_lstm"(%1, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %2 : tensor<4x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 6ab16141626..40420eee697 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -48,7 +48,8 @@ void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, quant_specs.default_ranges.second.hasValue()) { pass_manager->addPass(mlir::TFL::CreateDefaultQuantParamsPass( quant_specs.default_ranges.first.getValueOr(0.0), - quant_specs.default_ranges.second.getValueOr(0.0))); + quant_specs.default_ranges.second.getValueOr(0.0), + quant_specs.IsSignedInferenceType())); pass_manager->addPass(mlir::TFL::CreateQuantizePass()); pass_manager->addPass( mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); @@ -161,6 +162,10 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, pass_manager->addPass( mlir::TFL::CreatePrepareTFPass(pass_config.unfold_batch_matmul)); pass_manager->addNestedPass(mlir::createCanonicalizerPass()); + if (pass_config.shape_inference) { + // Add a shape inference pass to optimize away the unnecessary casts. 
+ pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); + } pass_manager->addPass( mlir::TFL::CreateLegalizeTFPass(pass_config.runtime_verification)); pass_manager->addPass(mlir::TFL::CreateOptimizePass()); diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 4bc9d9e0c2d..31dad60c294 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -160,6 +160,11 @@ int main(int argc, char **argv) { absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); absl::Span exported_names(exported_names_vector); + if (exported_names.size() != 1) { + llvm::errs() << "There should be only one exported name"; + return kTrFailure; + } + module = tensorflow::ImportSavedModel(input_file_name, saved_model_version, tags, exported_names, &context); } else { @@ -175,7 +180,7 @@ int main(int argc, char **argv) { if (!module.ok()) return kTrFailure; mlir::PassManager pm(&context); - applyPassManagerCLOptions(pm); + mlir::applyPassManagerCLOptions(pm); // Set the quantization specifications from the command line flags. mlir::TFL::QuantizationSpecs quant_specs; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index b9ec67736d9..62f64ab63b4 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -174,7 +174,7 @@ StatusOr ImportSavedModel( return module; } else if (saved_model_version == 1) { auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, context); + input_filename, tags, exported_names, context); if (!module) return tensorflow::errors::InvalidArgument("fail to open input file"); diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index a1602baced5..c23ae9fcfab 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -46,8 +46,11 @@ namespace { class DefaultQuantParamsPass : public PassWrapper { public: - explicit DefaultQuantParamsPass(double default_min, double default_max) - : default_min_(default_min), default_max_(default_max) {} + explicit DefaultQuantParamsPass(double default_min, double default_max, + bool is_signed) + : default_min_(default_min), + default_max_(default_max), + is_signed_(is_signed) {} void runOnFunction() override; @@ -82,6 +85,7 @@ class DefaultQuantParamsPass double default_min_; double default_max_; + bool is_signed_; quant::QuantParams default_quant_params_; }; } // namespace @@ -214,15 +218,16 @@ quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams( default_quant_params_ = quant::fakeQuantAttrsToType( builder.getUnknownLoc(), /*numBits=*/8, default_min_, default_max_, /*narrowRange=*/false, - builder.getF32Type()); + builder.getF32Type(), is_signed_); } return default_quant_params_; } // Creates an instance of the default quant parameters pass. 
std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max) { - return absl::make_unique(default_min, default_max); + double default_min, double default_max, bool is_signed) { + return absl::make_unique(default_min, default_max, + is_signed); } // Registers this pass with default values, only for test @@ -230,7 +235,8 @@ static PassRegistration pass( "tfl-default-quant", "Apply quantization with default quantization parameter", [] { return CreateDefaultQuantParamsPass(/*default_min=*/-1.0, - /*default_max=*/1.0); + /*default_max=*/1.0, + /*is_signed=*/false); }); } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 201a0bb2481..9b526f40277 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -321,7 +321,8 @@ void DenseToSparse::runOnFunction() { if (result.needs_densify) { const auto value = op->getOperand(operand); - auto densify = builder.create(op->getLoc(), value); + auto densify = + builder.create(op->getLoc(), value.getType(), value); value.replaceAllUsesWith(densify); densify.setOperand(value); } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 4c6a16c2233..3e9d6e488b8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -33,9 +33,6 @@ class ExtractI32At : NativeCodeCall< "$_builder.getI32IntegerAttr($_self.cast().getValue()[" # i # "].cast().getInt())">; -// Merge the two Attributes to a ArrayAttr; -def Merge2AttrsToArray : NativeCodeCall<"$_builder.getArrayAttr({$0, $1})">; - // Use the tensor type information from $0 and convert min $1, max $2 and // numBits $3 and narrowRange $4 to a QuantizedType. def ConvertToQuantTypeFromAttrs : NativeCodeCall< @@ -61,15 +58,12 @@ def HasNotSameStaticShapes : Constraint, "op must h def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getNoneType(), $_builder.getUnitAttr())">; -// Checks if the value has only one user. -// TODO(karimnosseir): Move to a common place? -def HasOneUse : Constraint>; - //===----------------------------------------------------------------------===// // Nullary ops patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; +def LegalizeTFConstToTFLConst: Pat<(TF_ConstOp ElementsAttr:$value), + (TFL_ConstOp $value)>; // Convert to std constant for statically shaped, non-opaque constants. def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), @@ -79,17 +73,23 @@ def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), // Unary ops patterns. //===----------------------------------------------------------------------===// def IsDataFormatNHWC : ConstantAttr; + +// Constraint that Attr has values [1, X, Y, 1] def IsIntList1XY1 : AttrConstraint>; + +// Constraint that values in list attribute are all ones. 
def IsAllOnes : AttrConstraint>; + +// Constraint that attribute is string with value either "SAME" or "VALID" def IsSameOrValid : AttrConstraint< CPred<"$_self.cast().getValue() == \"SAME\" || " # "$_self.cast().getValue() == \"VALID\"">, "'SAME' or 'VALID' paddings">; -def : Pat<(TF_AbsOp $arg), (TFL_AbsOp $arg)>; -def : Pat<(TF_AddNOp $inputs), (TFL_AddNOp $inputs)>; +def LegalizeAbs : Pat<(TF_AbsOp $arg), (TFL_AbsOp $arg)>; +def LegalizeAddN : Pat<(TF_AddNOp $inputs), (TFL_AddNOp $inputs)>; -def : Pat<(TF_AvgPoolOp $value, +def LegalizeAveragePool : Pat<(TF_AvgPoolOp $value, IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, @@ -102,35 +102,42 @@ def : Pat<(TF_AvgPoolOp $value, /*stride_w=*/ExtractI32At<2>:$strides, /*fused_activation_function=*/TFL_AF_None)>; -def : Pat<(TF_ArgMaxOp $input, $dim), (TFL_ArgMaxOp $input, $dim)>; -def : Pat<(TF_ArgMinOp $input, $dim), (TFL_ArgMinOp $input, $dim)>; +def LegalizeArgMax : Pat<(TF_ArgMaxOp $input, $dim), + (TFL_ArgMaxOp $input, $dim)>; +def LegalizeArgMin : Pat<(TF_ArgMinOp $input, $dim), + (TFL_ArgMinOp $input, $dim)>; -def : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; +def LegalizeCeil : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; -def : Pat<(TF_CosOp $arg), (TFL_CosOp $arg)>; +def LegalizeCos : Pat<(TF_CosOp $arg), (TFL_CosOp $arg)>; -def : Pat<(TF_EluOp $arg), (TFL_EluOp $arg)>; +def LegalizeElu : Pat<(TF_EluOp $arg), (TFL_EluOp $arg)>; -def : Pat<(TF_ExpandDimsOp $input, $dim), (TFL_ExpandDimsOp $input, $dim)>; +def LegalizeExpandDims : Pat<(TF_ExpandDimsOp $input, $dim), + (TFL_ExpandDimsOp $input, $dim)>; -def : Pat<(TF_FakeQuantWithMinMaxArgsOp $inputs, - $min, $max, - $num_bits, $narrow_range), - (TFL_DequantizeOp - (TFL_QuantizeOp $inputs, - (ConvertToQuantTypeFromAttrs $inputs, $min, $max, - $num_bits, $narrow_range)))>; +def LegalizeFakeQuantToDequantizeQuantize : Pat< + (TF_FakeQuantWithMinMaxArgsOp $inputs, $min, $max, $num_bits, $narrow_range), + (TFL_DequantizeOp + (TFL_QuantizeOp $inputs, + (ConvertToQuantTypeFromAttrs $inputs, $min, $max, + $num_bits, $narrow_range)))>; -def : Pat<(TF_FillOp $arg, $value), (TFL_FillOp $arg, $value)>; +def LegalizeFill : Pat<(TF_FillOp $arg, $value), (TFL_FillOp $arg, $value)>; -def : Pat<(TF_FloorOp $arg), (TFL_FloorOp $arg)>; +def LegalizeFloor : Pat<(TF_FloorOp $arg), (TFL_FloorOp $arg)>; -def : Pat<(TF_LeakyReluOp $arg, F32Attr:$a), (TFL_LeakyReluOp $arg, $a)>; -def : Pat<(TF_LogOp $arg), (TFL_LogOp $arg)>; -def : Pat<(TF_LogicalNotOp $arg), (TFL_LogicalNotOp $arg)>; -def : Pat<(TF_LogSoftmaxOp $arg), (TFL_LogSoftmaxOp $arg)>; +def LegalizeLeakyRelu : Pat<(TF_LeakyReluOp $arg, F32Attr:$a), + (TFL_LeakyReluOp $arg, $a)>; -def : Pat<(TF_MaxPoolOp $value, +def LegalizeLog : Pat<(TF_LogOp $arg), (TFL_LogOp $arg)>; + +def LegalizeNot : Pat<(TF_LogicalNotOp $arg), (TFL_LogicalNotOp $arg)>; + +def LegalizeLogSoftmax : Pat<(TF_LogSoftmaxOp $arg), (TFL_LogSoftmaxOp $arg)>; + +def LegalizeMaxPool2D : Pat< + (TF_MaxPoolOp $value, IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, @@ -143,8 +150,12 @@ def : Pat<(TF_MaxPoolOp $value, /*filter_height=*/ExtractI32At<1>:$ksize, /*fused_activation_function=*/TFL_AF_None)>; -def : Pat<(TF_MaximumOp $arg1, $arg2), (TFL_MaximumOp $arg1, $arg2)>; -def : Pat<(TF_MinimumOp $arg1, $arg2), (TFL_MinimumOp $arg1, $arg2)>; +def LegalizeMaximum : Pat<(TF_MaximumOp $arg1, $arg2), + (TFL_MaximumOp $arg1, $arg2)>; + +def LegalizeMinimum : Pat<(TF_MinimumOp $arg1, $arg2), + (TFL_MinimumOp $arg1, $arg2)>; + def : Pat<(TF_NegOp $arg), (TFL_NegOp $arg)>; def : 
Pat<(TF_OneHotOp $indices, $depth, $on_value, $off_value, $axis), (TFL_OneHotOp $indices, $depth, $on_value, $off_value, diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index ab4c4f5c4cf..bfcbc190638 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -37,6 +37,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" @@ -202,6 +203,26 @@ LogicalResult ConvertTFConcatOp::matchAndRewrite( return success(); } +// Converts any IntegerAttr to an IntegerAttr of an i32 type. +// The value won't change in the new attribute, but if the value is out of +// the bound of i32, the function returns a failure. +LogicalResult ConvertToI32Attr(IntegerAttr attr, IntegerAttr* attr_i32) { + if (attr.getType().isInteger(/*width=*/32)) { + *attr_i32 = attr; + return success(); + } + + int64_t value = attr.getInt(); + if (value > std::numeric_limits::max() || + value < std::numeric_limits::min()) { + return failure(); + } + + *attr_i32 = IntegerAttr::get( + IntegerType::get(/*width=*/32, attr.getContext()), value); + return success(); +} + LogicalResult ConvertTFConcatV2Op::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_concat_op = cast(op); @@ -211,12 +232,16 @@ LogicalResult ConvertTFConcatV2Op::matchAndRewrite( // Extract axis attribute from constant axis tensor ElementsAttr axis; if (!matchPattern(tf_concat_op.axis(), m_Constant(&axis))) return failure(); + IntegerAttr axis_int = ExtractSingleElementAsInteger(axis); + + // "axis" operand could be a i64 tensor. Resolve it here. 
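// Editorial illustration (not part of this change; the values and the use of `rewriter` are hypothetical):
// ConvertToI32Attr, defined above, keeps the integer value but rebuilds the attribute with an i32 type,
// and fails when the value does not fit, e.g.
//
//   IntegerAttr axis_i64 = rewriter.getI64IntegerAttr(3);
//   IntegerAttr narrowed;
//   succeeded(ConvertToI32Attr(axis_i64, &narrowed));   // true: narrowed is an i32 attr with value 3
//   succeeded(ConvertToI32Attr(rewriter.getI64IntegerAttr(int64_t{1} << 40),
//                              &narrowed));             // false: value is outside the i32 range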
+ IntegerAttr axis_i32; + if (failed(ConvertToI32Attr(axis_int, &axis_i32))) return failure(); StringAttr fused_activation_function = StringAttr::get("NONE", rewriter.getContext()); rewriter.replaceOpWithNewOp( - op, output_type, values, ExtractSingleElementAsInteger(axis), - fused_activation_function); + op, output_type, values, axis_i32, fused_activation_function); return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc index 307a45639c5..5cb659fa318 100644 --- a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc +++ b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc @@ -184,7 +184,7 @@ void LoadQuantizationRecipe::LoadForLSTMOp(LSTMOp lstm, OpBuilder* builder) { auto new_cell_tanh = builder->create(loc, int16, new_cell); auto hidden_state = builder->create( - loc, int16, new_cell_tanh.y(), output_gate->getResult(0), none_af); + loc, int16, new_cell_tanh.output(), output_gate->getResult(0), none_af); auto act = builder->create( loc, int8, hidden_state.output(), lstm.projection_weights(), lstm.projection_bias(), none_af, fc_format, keep_dims); diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index a69b0a3c624..2498a732a86 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -577,7 +577,6 @@ struct ConvertTensorListResize ArrayRef({input_handle, input_shape, size_diff, size}), /*then_branch=*/rewriter.getSymbolRefAttr(then_branch_op), /*else_branch=*/rewriter.getSymbolRefAttr(else_branch_op), - /*output_shapes=*/rewriter.getArrayAttr({}), /*is_stateless=*/rewriter.getBoolAttr(true)); return success(); } @@ -838,7 +837,8 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( // TensorFlow operations that doesn't have operands and results of type // variant are legal. Here, we don't distinguish between variants encoding // TensorList or some other type as that information is not available here. - // This constraint should be relaxed to support other variant types in TFLite. + // Partial legalization is used below to still allow ops with variant + // types. auto is_legal = [](Operation *op) { auto is_not_variant = [](Type ty) { return !ty.cast().getElementType().isa(); }; @@ -859,6 +859,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); target.addLegalOp(); target.addLegalOp(); + target.addLegalOp(); // Register fused LSTM/RNN ops as legal.
target.addLegalOp(); target.addLegalOp(); @@ -872,7 +873,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( ConvertTensorListPushBack, ConvertTensorListReserve, ConvertTensorListSetItem, ConvertTensorListStack, ConvertTensorListResize, ConvertWhile>(context); - return applyFullConversion(func, target, patterns); + return applyPartialConversion(func, target, patterns); } void LowerStaticTensorListPass::runOnOperation() { diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index a1aedb0af32..30ae4b81f4f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -206,6 +206,28 @@ DenseElementsAttr GetShape(Value output_val) { llvm::makeArrayRef(shape)); } +static Type GetShapeStrippedType(TypeAttr type_attr) { + auto type = type_attr.getValue(); + auto shaped_type = type.dyn_cast(); + if (shaped_type) { + return shaped_type.getElementType(); + } else { + return type; + } +} + +bool NotFromQuantOpDifferentQuant(Value val, TypeAttr qtype_attr) { + auto val_defn_op = val.getDefiningOp(); + TFL::QuantizeOp q_op = llvm::dyn_cast_or_null(val_defn_op); + if (!q_op) return true; + + // Ignore shape details - we're really only trying to + // check if the quantization is the same. + auto stripped_src_qtype = GetShapeStrippedType(q_op.qtypeAttr()); + auto stripped_qtype = GetShapeStrippedType(qtype_attr); + return stripped_src_qtype == stripped_qtype; +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" // Fuse Add with proceeding FullyConnected. @@ -641,8 +663,8 @@ struct ConvertTrivialTransposeOpToReshapeOp LogicalResult matchAndRewrite(TFL::TransposeOp transpose_op, PatternRewriter &rewriter) const override { - auto input_type = transpose_op.x().getType().cast(); - auto output_type = transpose_op.y().getType().cast(); + auto input_type = transpose_op.input().getType().cast(); + auto output_type = transpose_op.output().getType().cast(); // It's possible to know if the transformation is safe only if the input // & output shapes are fully known and permutation is a constant. if (!input_type.hasStaticShape() || !output_type.hasStaticShape()) @@ -691,7 +713,8 @@ struct ConvertTrivialTransposeOpToReshapeOp auto new_shape = rewriter.create(loc, new_shape_attr); rewriter.replaceOpWithNewOp( - transpose_op, transpose_op.y().getType(), transpose_op.x(), new_shape); + transpose_op, transpose_op.output().getType(), transpose_op.input(), + new_shape); return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index a3244f31053..e8f1c9c2cf3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -29,6 +29,14 @@ def ExtractSingleElementAsFloat : NativeCodeCall< // Checks if the value has only one user. def HasOneUse : Constraint>; +// Checks if the value has rank at most 'n'. +class HasRankAtMost : Constraint< + CPred<"$0.getType().cast().getRank() <= " # n>>; + +// Checks that the value is not produced by a TFL_Quant op with a +// different quantization attribute. +def NotFromQuantOpDifferentQuant : Constraint>; + //===----------------------------------------------------------------------===// // Ternary ops patterns.
//===----------------------------------------------------------------------===// @@ -160,7 +168,10 @@ foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in // This pattern applies when the same quantize/dequantize have been used twice // with the same scale. We want to remove the redundancy. // TODO(fengliuai): move this to the sanity check of pre-quantize pass. -def : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in)>; +def eliminate_dq_q_pairs : Pat< + (TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), + (replaceWithValue $in), + [(NotFromQuantOpDifferentQuant $in, $qt)]>; // Constraint that makes sure both operands are the same operands. @@ -347,7 +358,9 @@ foreach BinaryOp = [TFL_AddOp, TFL_SubOp, TFL_DivOp, TFL_MulOp] in { // The result of the new "BinaryOp" will have the same shape as // `input`. In other words, the shape of the `Reshape` op are not // changed after the transformation. - (IsTailOfShape $rhs, $input)]>; + (IsTailOfShape $rhs, $input), + (HasRankAtMost<5> $input), + (HasRankAtMost<5> $rhs)]>; } foreach BinaryOp = [TFL_FloorDivOp, TFL_FloorModOp, TFL_MinimumOp, diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 959c17e317a..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -76,7 +76,7 @@ std::unique_ptr> CreateOptimizeFunctionalOpsPass(); // Creates an instance of the TensorFlow Lite dialect pass to add default // quantization parameters. std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max); + double default_min, double default_max, bool is_signed); // Creates an instance of the TensorFlow Lite dialect pass to convert dense // tensor to sparse format. diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 97b7d57dbf4..9a1da0ad03d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -18,6 +18,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -118,6 +119,24 @@ void RemoveQuantizationAdaptorOps(FuncOp func) { func.setType(new_func_type); } +// Remove the back-to-back quantize and dequantize ops with volatile attribute. 
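// Editorial sketch (not part of this change; the IR names below are hypothetical): the
// RemoveVolatileOps pattern defined next matches a tfl.dequantize whose producer is a
// tfl.quantize carrying the volatile attribute and forwards the original float value to
// the dequantize's users, e.g.
//
//   %q = "tfl.quantize"(%x) {qtype = ..., volatile}
//   %y = "tfl.dequantize"(%q)
//   ... users of %y ...
//
// becomes, once the users are rewired and the dead pair is cleaned up,
//
//   ... users of %x ...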
+struct RemoveVolatileOps : public OpRewritePattern { + explicit RemoveVolatileOps(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(DequantizeOp op, + PatternRewriter& rewriter) const override { + auto input_op = op.input().getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(input_op)) { + if (!q.getAttr(mlir::quant::kVolatileOpAttrName)) return failure(); + + op.replaceAllUsesWith(q.input()); + return success(); + } + return failure(); + } +}; + #include "tensorflow/compiler/mlir/lite/transforms/generated_post_quantize.inc" void PostQuantizePass::runOnFunction() { @@ -125,11 +144,15 @@ void PostQuantizePass::runOnFunction() { auto func = getFunction(); auto* ctx = func.getContext(); TFL::populateWithGenerated(ctx, &patterns); + patterns.insert>(ctx); applyPatternsAndFoldGreedily(func, patterns); if (!emit_quant_adaptor_ops_) { RemoveQuantizationAdaptorOps(getFunction()); } + + patterns.insert(ctx); + applyPatternsAndFoldGreedily(func, patterns); } } // namespace diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 6179eb2ce64..56af68f6bbe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -41,15 +41,22 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +// The cmd line flag to turn on/off Tf.Text API fusion. // NOLINTNEXTLINE +static llvm::cl::opt fuse_tftext( + "tfl-fuse-tftext", llvm::cl::value_desc("bool"), + llvm::cl::desc("Fuse TF.Text API ops when it's true"), + llvm::cl::init(false)); namespace mlir { namespace TFL { namespace { constexpr char kTFAPIImplements[] = "tf.api_implements"; +constexpr char kTfTextAPIPRefix[] = "tftext:"; // Abstracts the conversion of the embedded lookup composite function. class ConvertEmbeddedLookupFunc { @@ -187,6 +194,10 @@ void PrepareCompositeFunctionsPass::ConvertTFAPIImplements(FuncOp func, OpBuilder builder(func.getBody()); if (failed(ConvertKerasLSTMLayer(func, &builder))) return signalPassFailure(); + } else if (fuse_tftext && attr.getValue().startswith(kTfTextAPIPRefix)) { + if (failed(ConvertTFTextAPI(func, attr.getValue()))) { + return signalPassFailure(); + } } } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index a9e10a485bf..579063f9c9d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -70,6 +70,7 @@ class PrepareQuantizePass : public PassWrapper { public: // Constructor used by the PassRegistration and enforce uint8 quantization. + // This is only used by test. explicit PrepareQuantizePass() { if (quantize_signed) quant_specs_.inference_type = tensorflow::DT_QINT8; @@ -108,8 +109,8 @@ class PrepareQuantizePass // Get the min and max values from the quantization specification for the // current function function and argument index. Uses default values if // the function is specified in the `quantize_whitelist`. 
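// Editorial note (not part of this change): with this change GetMinMaxValuesForArgument
// returns a pair of llvm::Optional values, so "no range specified" is encoded as an
// unset Optional rather than a default number, e.g.
//
//   auto min_max = GetMinMaxValuesForArgument(func_name, i);
//   if (!min_max.first.hasValue() || !min_max.second.hasValue())
//     return;  // skip arguments without user-provided ranges
//
// which is exactly how the caller below uses it.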
- std::pair GetMinMaxValuesForArgument( - llvm::StringRef func_name, int index) { + std::pair, llvm::Optional> + GetMinMaxValuesForArgument(llvm::StringRef func_name, int index) { if (func_name == quant_specs_.target_func) { return quant_specs_.input_ranges[index]; } else { @@ -159,10 +160,14 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(FuncOp func) { } auto min_max = GetMinMaxValuesForArgument(func_name, i); + // If the input min/max or mean/std are not specified, then skip. + if (!min_max.first.hasValue() || !min_max.second.hasValue()) return; + TypeAttr params = quant::GetQuantizedTypeAttr( - builder, input_type, builder.getF64FloatAttr(min_max.first), - builder.getF64FloatAttr(min_max.second), /*quant_dim=*/-1, num_bits, - narrow_range, is_signed); + builder, input_type, + builder.getF64FloatAttr(min_max.first.getValue()), + builder.getF64FloatAttr(min_max.second.getValue()), + /*quant_dim=*/-1, num_bits, narrow_range, is_signed); builder.setInsertionPoint(block, insertion_point); auto q_op = builder.create(loc, params.getValue(), arg); @@ -205,6 +210,7 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { // one is returned directly, we decide to return the quantized result instead, // so this op can be quantized. This is only applied on the returned result // because the error will not be accumulated. + func.walk([&](ReturnOp ret) { int i = 0; for (Value returned : ret.operands()) { @@ -232,6 +238,51 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { "Missing quantization parameter on the output might introduce " "quantization error!"); }); + + // Check for (Quant (Dequant $in), $qA) "qdq" pairs that couldn't be + // eliminated at this point. This only occurs for the pattern + // (Quant (Dequant (Quant $in, $qB)), $qA) $qB != $qA + // where the qdq pair denotes a non-trivial requantization of an + // already quantized value. Since this makes little sense (directly quantizing + // (Quant $in, $qA) would introduce less quantization noise) the likely cause + // is a minor error in constructing the original network model that + // introduced back-to-back Fake Quantization operations. Hence: emit a + // warning. N.b. at this point we're (temporarily) in the quantization dialect + // (presumably to enable re-use in xla etc.), so it is quant::*QuantizeCastOp + // we're matching here. + // + func.walk([&](quant::QuantizeCastOp q_op) { + // If we end up with a quantize op fed by a dequantize op, look closer. + auto dq_op = dyn_cast_or_null( + q_op.getOperand().getDefiningOp()); + if (!dq_op) { + return; + } + auto dq_arg = dq_op.getOperand(); + + if (!dq_arg.hasOneUse()) { + // The initial quantization is used someplace else ... so it might be + // reasonable for it to be requantized for another purpose. + // TODO: ideally we would still want to check whether the requantization + // narrows rather than widens the representation. + return; + } + + // Invariant: + // isa(dq_arg.getDefiningOp()) --> + // dq_arg.getType() != q_op.getResult().getType() + // + // as otherwise the qdq pair would have been optimized away. + auto qd_arg_def_q_op = + dyn_cast_or_null(dq_arg.getDefiningOp()); + if (!qd_arg_def_q_op) { + return; + } + + qd_arg_def_q_op.emitWarning() + << " quantizer's output has another quantizer (" << q_op.getLoc() + << ") as consumer - intentional?"; + }); } using PrepareQuantStats = @@ -257,15 +308,16 @@ void PrepareQuantizePass::runOnFunction() { // convert all of them to signed.
OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); + int bit_width = quant_specs_.GetQuantizationTypeWidth(); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, true, ctx); + patterns.insert(bit_width, false, true, ctx); } else { // Convert quant stats to uint8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, false, ctx); + patterns.insert(bit_width, false, false, ctx); } applyPatternsAndFoldGreedily(func, patterns); diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index c5211bdfadb..3310c521a5a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -82,13 +82,48 @@ class PrepareTFPass : public PassWrapper { bool unfold_batch_matmul_; }; +template +struct FetchConstantMinMaxInputs { + using AttrType = DenseFPElementsAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + Value min = tf_op.min(), max = tf_op.max(); + + // TODO: incomplete; neither IdentityN ops + // nor chains of Identity* (not rare) are handled. + if (auto id1 = dyn_cast_or_null(min.getDefiningOp())) + min = id1.input(); + if (auto id2 = dyn_cast_or_null(max.getDefiningOp())) + max = id2.input(); + if (!matchPattern(min, m_Constant(&min_value))) { + return false; + } + if (!matchPattern(max, m_Constant(&max_value))) { + return false; + } + return true; // Successfully matched and fetched. + } +}; + +template +struct FetchMinMaxAttrs { + using AttrType = FloatAttr; + bool operator()(TFFakeQuantOp tf_op, AttrType &min_value, + AttrType &max_value) const { + min_value = tf_op.minAttr(); + max_value = tf_op.maxAttr(); + return true; // Successfully matched and fetched. + } +}; + // TODO(fengliuai): move this rule to PreparePatterns.td // TODO(fengliuai): reuse the quantization/tensorflow/tf_to_quant pass. // TODO(b/140968741): propagate the sign from the command line. Currently all // the FakeQuant is assumed to targeting UIN8, but per-channel kernel is // actually INT8. // Inserts a "tfl.quantize" and "tfl.dequantize" op pair (QDQs) after the -// "tf.FakeQuantWithMinMaxVarsOp" to be constant folded. Since the constant +// tf.FakeQuantWithMinMax{Vars|VarsPerChannel|Args}Op +// to be constant folded. Since the constant // folding logic will use a "std.constant" op to replace the // "tf.FakeQuantWithMinMaxVarsOp", the "tfl.quantize" op is used to preserve // the quantization parameters as a TypeAttr and "tfl.dequantize" op used to @@ -112,33 +147,49 @@ class PrepareTFPass : public PassWrapper { // | // tf.dequantize // | -template +// +// +// Warns if the (most likely unwanted, currently not quite correctly handled) +// case of back-to-back tf.FakeQuant occurs +// +// tf.FakeQuant* +// | +// tf.FakeQuant* +// +// tf.identity / tf.IdentityN between the tf.FakeQuant* ops +// need no special treatment, as they are already eliminated before the rewrites / +// check is applied.
+// + +template struct InsertTFLQuantOpsAfterTFFakeQuantOp : public OpRewritePattern { - using BaseType = InsertTFLQuantOpsAfterTFFakeQuantOp; + using BaseType = + InsertTFLQuantOpsAfterTFFakeQuantOp; - explicit InsertTFLQuantOpsAfterTFFakeQuantOp( - MLIRContext *ctx) + explicit InsertTFLQuantOpsAfterTFFakeQuantOp(MLIRContext *ctx) : OpRewritePattern(ctx) {} + FetchMinMax fetchMinMax; + + using FetchAttrType = typename FetchMinMax::AttrType; LogicalResult matchAndRewrite(TFFakeQuantOp tf_op, PatternRewriter &rewriter) const override { // We don't want to insert quantize/dequantize if the quantize op exists. auto res = tf_op.outputs(); - if (!res.hasOneUse() || isa(*res.user_begin())) + if (!res.hasOneUse() || isa(*res.user_begin())) { return failure(); + } // Extract the min/max constant values from the operands. We also consider // a special case that there are tf.Identity ops between the min/max // constants and the tf.FakeQuantWithMinMaxVarsOp. - Value min = tf_op.min(), max = tf_op.max(); - DenseFPElementsAttr min_value, max_value; - if (auto id1 = dyn_cast_or_null(min.getDefiningOp())) - min = id1.input(); - if (auto id2 = dyn_cast_or_null(max.getDefiningOp())) - max = id2.input(); - if (!matchPattern(min, m_Constant(&min_value))) return failure(); - if (!matchPattern(max, m_Constant(&max_value))) return failure(); + + FetchAttrType min_value, max_value; + if (!fetchMinMax(tf_op, min_value, max_value)) { + return failure(); + } int quant_dim = -1; if (PerAxis) { @@ -155,7 +206,9 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp TypeAttr qtype = quant::GetQuantizedTypeAttr( rewriter, res_type, min_value, max_value, quant_dim, num_bits, narrow_range, /*is_signed=*/false); - if (!qtype) failure(); + if (!qtype) { + return failure(); + } // Finally, use the quantization parameter to create the quantize and // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp @@ -172,12 +225,22 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp } }; -using PreparePerTensorFakeQuant = - InsertTFLQuantOpsAfterTFFakeQuantOp; +// +// Three instances of the rule to cover the three different types of +// TF::FakeQuant operators +// +using PreparePerTensorFakeQuant = InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxVarsOp, /*PerAxis=*/false, + FetchConstantMinMaxInputs>; -using PreparePerChannelFakeQuant = - InsertTFLQuantOpsAfterTFFakeQuantOp; +using PreparePerChannelFakeQuant = InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxVarsPerChannelOp, /*PerAxis=*/true, + FetchConstantMinMaxInputs>; + +using PreparePerTensorFakeQuantWithMinMaxArgs = + InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxArgsOp, /*PerAxis=*/false, + FetchMinMaxAttrs>; // Templated class for declaring a converter from some TensorFlow convolution // op into its counterpart in TensorFlow Lite. @@ -644,9 +707,10 @@ void PrepareTFPass::runOnFunction() { // This pattern was intented to uses TFL QDQs to preserve the quantization // parameters from the TF Quant ops, thus this pattern should run with the - // first `applyPatternsAndFoldGreedily` method, which would otherwise removes - // the TF FakeQuant ops by the constant folding. - patterns.insert(ctx); + // first `applyPatternsGreedily` method, which would otherwise removes the + // TF FakeQuant ops by the constant folding. + patterns.insert(ctx); // This pattern will try to identify and optimize for dilated convolution. // e.g. 
Patterns like "SpaceToBatchND -> Conv2D -> BatchToSpaceND" will be diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 5df57de6f71..081ba7ac6e7 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -35,6 +35,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" namespace mlir { @@ -92,7 +93,9 @@ class LstmUtilsTest : public ::testing::Test { LstmUtilsTest() {} void SetUp() override { - builder_ = std::unique_ptr(new Builder(&context_)); + RegisterDialects(); + context_ = std::make_unique(); + builder_ = std::unique_ptr(new Builder(context_.get())); fused_lstm_func_ = createLstmCompositeFunc(builder_.get(), false, false); fused_lstm_func_cifg_ = createLstmCompositeFunc(builder_.get(), false, true); @@ -105,10 +108,17 @@ class LstmUtilsTest : public ::testing::Test { fused_ln_lstm_func_.erase(); builder_.reset(); } + + void RegisterDialects() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + } + FuncOp fused_lstm_func_; FuncOp fused_lstm_func_cifg_; FuncOp fused_ln_lstm_func_; - mlir::MLIRContext context_; + std::unique_ptr context_; std::unique_ptr builder_; }; diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc new file mode 100644 index 00000000000..12929152d1e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -0,0 +1,127 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { + +namespace { + +constexpr char kWhitespaceTokenizer[] = "tftext:WhitespaceTokenizer"; +constexpr char kTFAPIImplements[] = "tf.api_implements"; + +inline OpaqueElementsAttr emptyCustomOption(OpBuilder* builder) { + std::string content = ""; + ShapedType type = RankedTensorType::get( + {static_cast(content.size())}, builder->getIntegerType(8)); + return OpaqueElementsAttr::get( + builder->getContext()->getRegisteredDialect("tfl"), type, content); +} + +inline RankedTensorType getInputType(mlir::FuncOp func, int idx) { + return func.getType() + .getInput(idx) + .dyn_cast_or_null(); +} + +inline RankedTensorType getResultType(mlir::FuncOp func, int idx) { + return func.getType() + .getResult(idx) + .dyn_cast_or_null(); +} + +LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { + if (func.getNumResults() != 2) { + return failure(); + } + if (func.getNumArguments() != 1) { + return failure(); + } + auto input_type = getInputType(func, 0); + if (!input_type || input_type.getRank() != 1 || + !input_type.getElementType().isa()) { + return failure(); + } + auto value_type = getResultType(func, 0); + if (!value_type || value_type.getRank() != 1 || + !value_type.getElementType().isa()) { + return failure(); + } + auto offset_type = getResultType(func, 1); + if (offset_type.getRank() != 1 || + !offset_type.getElementType().isInteger(64)) { + return failure(); + } + return success(); +} + +LogicalResult ConvertWhitespaceTokenizer(mlir::FuncOp func, + llvm::StringRef api) { + func.eraseBody(); + func.addEntryBlock(); + func.setAttr(kTFAPIImplements, StringAttr::get(api, func.getContext())); + + Value text = func.getArgument(0); + auto output_type = func.getType().getResult(0); + auto offset_type = func.getType().getResult(1); + SmallVector shape = {output_type, offset_type}; + ArrayRef output_types(shape); + + OpBuilder builder(func.getBody()); + + auto op = builder.create(func.getLoc(), output_types, + ValueRange(text), api, + emptyCustomOption(&builder)); + + builder.create(func.getLoc(), op.getResults()); + return success(); +} +} // namespace + +LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api) { + if (api.str() == kWhitespaceTokenizer) { + if (succeeded(VerifyWhitespaceTokenizer(func))) { + return 
ConvertWhitespaceTokenizer(func, api); + } + } + return failure(); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.h b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h new file mode 100644 index 00000000000..283e57c179a --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { + +LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 11d3e7332db..b2225ec1c4f 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -21,6 +21,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_os_ostream.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -93,9 +94,10 @@ MlirOptimizationPassRegistry& MlirOptimizationPassRegistry::Global() { static void RegisterDialects() { static bool init_once = []() { mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); - mlir::registerDialect(); return true; }(); (void)init_once; diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 666f89ac72f..1189a926383 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -12,6 +12,22 @@ cc_library( "//tensorflow/c:tf_status_helper", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:error_util", + # (yongtang) The graph_optimization_pass_registration needs to be part + # of a shared object that will be loaded whenever `import tensorflow` + # is run. 
The natural place is libtensorflow_framework.so. + # While adding graph_optimization_pass_registration to + # libtensorflow_framework.so is possible with some modification in + # dependency, many tests will fail due to multiple copies of LLVM. + # See https://github.com/tensorflow/tensorflow/pull/39231 for details. + # Alternatively, we place graph_optimization_pass_registration here + # because: + # - tensorflow/python/_pywrap_mlir.so already depends on LLVM anyway + # - tensorflow/python/_pywrap_mlir.so always loaded as part of python + # binding + # TODO: It might be still preferrable to place graph_optimization_pass + # as part of the libtensorflow_framework.so, as it is the central + # place for core related components. + "//tensorflow/compiler/mlir/tensorflow:graph_optimization_pass_registration", "//tensorflow/compiler/mlir/tensorflow:import_utils", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index d0f6e015922..f22fb519a64 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -112,7 +112,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( // Convert the SavedModelBundle to an MLIR module. mlir::MLIRContext context; - auto module_or = ConvertSavedModelV1ToMlir(bundle, &context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, {}, &context); if (!module_or.status().ok()) { Set_TF_Status_from_Status(status, module_or.status()); return "// error"; diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD new file mode 100644 index 00000000000..78f4312da46 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD @@ -0,0 +1,41 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "mlir_wrapper", + srcs = [ + "attrs.cc", + "basic_classes.cc", + "builders.cc", + "mlir_wrapper.cc", + "mlir_wrapper.h", + "ops.cc", + "types.cc", + ], + module_name = "mlir_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) + +tf_python_pybind_extension( + name = "filecheck_wrapper", + srcs = ["filecheck_wrapper.cc"], + module_name = "filecheck_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@pybind11", + ], +) diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc new file mode 100644 index 00000000000..ca7faf2e1d3 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/attrs.cc @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_attrs(py::module& m) { + py::class_(m, "Attribute"); + py::class_(m, "IntegerAttr") + .def("get", + py::overload_cast(&mlir::IntegerAttr::get)); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc new file mode 100644 index 00000000000..25adb44fe1d --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_basic_classes(py::module& m) { + py::class_(m, "MLIRContext").def(py::init<>()); + + py::class_(m, "Location"); + + py::class_(m, "UnknownLoc") + .def("get", &mlir::UnknownLoc::get); + + py::class_(m, "Region") + .def("back", &mlir::Region::back, py::return_value_policy::reference) + .def("front", &mlir::Region::front, py::return_value_policy::reference) + .def("add_block", [](mlir::Region& r) { r.push_back(new mlir::Block); }) + .def("push_back", &mlir::Region::push_back) + .def("size", [](mlir::Region& r) { return r.getBlocks().size(); }) + .def("front", &mlir::Region::front, py::return_value_policy::reference); + py::class_(m, "Block_Iterator"); + py::class_(m, "Block") + .def("new", ([]() { return new mlir::Block; }), + py::return_value_policy::reference) + .def("end", &mlir::Block::end) + .def("addArgument", &mlir::Block::addArgument); + + py::class_(m, "Value").def("getType", &mlir::Value::getType); + py::class_(m, "OpResult"); + py::class_(m, "BlockArgument"); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc new file mode 100644 index 00000000000..338f17ed6df --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/builders.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Builders.h" // from @llvm-project + +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +void init_builders(py::module& m) { + py::class_(m, "Builder") + .def(py::init()) + .def("getFunctionType", + [](mlir::Builder& b, std::vector inputs, + std::vector outputs) { + return b.getFunctionType(llvm::ArrayRef(inputs), + llvm::ArrayRef(outputs)); + }); + py::class_(m, "OpBuilder") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("getUnknownLoc", &mlir::OpBuilder::getUnknownLoc) + .def("setInsertionPoint", + py::overload_cast( + &mlir::OpBuilder::setInsertionPoint)) + .def("saveInsertionPoint", &mlir::OpBuilder::saveInsertionPoint) + .def("restoreInsertionPoint", &mlir::OpBuilder::restoreInsertionPoint) + .def( + "createOperation", + [](mlir::OpBuilder& opb, mlir::OperationState& state) { + return opb.createOperation(state); + }, + py::return_value_policy::reference) + .def("getContext", &mlir::OpBuilder::getContext, + py::return_value_policy::reference); + + py::class_(m, "OpBuilder_InsertionPoint") + .def("getBlock", &mlir::OpBuilder::InsertPoint::getBlock); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc new file mode 100644 index 00000000000..8a841856b72 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "llvm/Support/SourceMgr.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(filecheck_wrapper, m) { + m.def("check", [](std::string input, std::string check) { + llvm::FileCheckRequest fcr; + llvm::FileCheck fc(fcr); + llvm::SourceMgr SM = llvm::SourceMgr(); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), + llvm::SMLoc()); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(check), + llvm::SMLoc()); + llvm::Regex regex = fc.buildCheckPrefixRegex(); + fc.readCheckFile(SM, llvm::StringRef(check), regex); + return fc.checkInput(SM, llvm::StringRef(input)); + }); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc new file mode 100644 index 00000000000..6f468cd4267 --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(mlir_wrapper, m) { + m.def("registerDialects", []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + }); + + init_basic_classes(m); + init_types(m); + init_builders(m); + init_ops(m); + init_attrs(m); +} diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h similarity index 57% rename from tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc rename to tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h index 7632b0546fa..562c59b43e1 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h @@ -13,19 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -//===- dialect_static_registration.cc -------------------------------------===// -// -// This file registers the RuntimeFallbackDialect. 
-// -//===----------------------------------------------------------------------===// +#ifndef TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H +#define TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" -namespace mlir { -namespace tfd { +namespace py = pybind11; -// Static initialization for dialect registration. -static DialectRegistration tfd_registration; +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); -} // namespace tfd -} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_WRAPPER_MLIR_WRAPPER_H diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc new file mode 100644 index 00000000000..4432829653e --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/ops.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project + +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +void init_ops(py::module& m) { + py::class_>( + m, "Operation") + .def("getRegion", &mlir::Operation::getRegion, + py::return_value_policy::reference) + .def("getResult", &mlir::Operation::getResult) + .def("dump", &mlir::Operation::dump) + .def("getNumResults", &mlir::Operation::getNumResults); + + py::class_(m, "OperationState") + .def(py::init([](mlir::Location loc, std::string name) { + return mlir::OperationState(loc, llvm::StringRef(name)); + })) + .def("addTypes", + [](mlir::OperationState& state, std::vector tys) { + state.addTypes(mlir::ArrayRef(tys)); + }) + .def("addOperands", + [](mlir::OperationState& os, std::vector ops) { + os.addOperands(mlir::ArrayRef(ops)); + }) + .def("addRegion", py::overload_cast<>(&mlir::OperationState::addRegion), + py::return_value_policy::reference); + + py::class_(m, "ModuleOp") + .def("create", + [](mlir::Location loc) { return mlir::ModuleOp::create(loc); }) + .def("push_back", + [](mlir::ModuleOp& m, mlir::FuncOp f) { m.push_back(f); }) + .def("dump", &mlir::ModuleOp::dump) + .def("getAsStr", [](mlir::ModuleOp& m) { + std::string str; + llvm::raw_string_ostream os(str); + m.print(os); + return os.str(); + }); + + py::class_(m, "FuncOp") + .def("create", + [](mlir::Location location, std::string name, + mlir::FunctionType type) { + auto func = mlir::FuncOp::create(location, name, type); + func.addEntryBlock(); + return func; + }) + .def( + "getBody", + [](mlir::FuncOp& f) -> mlir::Region& { return f.getBody(); }, + 
py::return_value_policy::reference) + .def("getArguments", + [](mlir::FuncOp& f) { return f.getArguments().vec(); }) + .def("getName", [](mlir::FuncOp& f) { return f.getName().str(); }) + .def("getType", &mlir::FuncOp::getType); + + py::class_(m, "ReturnOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector values) -> mlir::Operation* { + return opb + .create(loc, + mlir::ArrayRef(values)) + .getOperation(); + }); + + // mlir::TF::AddOp + py::class_(m, "Tf_AddV2Op") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + py::class_(m, "Tf_AnyOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value input, + mlir::Value reduction_indices, + bool keep_dims = false) -> mlir::Operation* { + return opb + .create(loc, opb.getI1Type(), input, + reduction_indices, keep_dims) + .getOperation(); + }); + + // mlir::TF::ConstOp + py::class_(m, "Tf_ConstOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Attribute value) -> mlir::Operation* { + return opb.create(loc, value).getOperation(); + }); + + // mlir::TF::EqualOp + py::class_(m, "Tf_EqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb + .create(loc, x, y, opb.getBoolAttr(true)) + .getOperation(); + }); + + // mlir::TF::GreaterEqualOp + py::class_(m, "Tf_GreaterEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y) + .getOperation(); + }); + + // mlir::TF::GreaterOp + py::class_(m, "Tf_GreaterOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LegacyCallOp + py::class_(m, "Tf_LegacyCallOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector output, std::vector args, + std::string f) -> mlir::Operation* { + return opb + .create( + loc, mlir::ArrayRef(output), + mlir::ArrayRef(args), mlir::StringRef(f)) + .getOperation(); + }); + + // mlir::TF::LessEqualOp + py::class_(m, "Tf_LessEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LessOp + py::class_(m, "Tf_LessOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::NegOp + py::class_(m, "Tf_NegOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Value x) -> mlir::Operation* { + return opb.create(loc, x).getOperation(); + }); + + py::class_(m, "Tf_NotEqualOp") + .def("create", [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) { + return opb + .create( + loc, x, y, mlir::BoolAttr::get(true, opb.getContext())) + .getOperation(); + }); + + // mlir::TF::SubOp + py::class_(m, "Tf_SubOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); +} diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc new file mode 100644 index 00000000000..2be67f8e93e 
--- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +void init_types(py::module& m) { + // Type + py::class_ Type(m, "Type"); + Type.def("getKind", &mlir::Type::getKind); + + // Type Enums + py::enum_(Type, "StandardTypes_Kind") + .value("BF16", mlir::StandardTypes::BF16); + + // Type Sub-classes + py::class_(m, "FunctionType") + .def("getResults", + [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); + + py::class_(m, "FloatType") + .def("get", &mlir::FloatType::get); + + py::class_(m, "IntegerType") + .def("get", py::overload_cast( + &mlir::IntegerType::get)); + + py::class_(m, "UnrankedTensorType") + .def("get", &mlir::UnrankedTensorType::get); + + py::class_(m, "RankedTensorType") + .def("get", [](std::vector shape, mlir::Type ty) { + return mlir::RankedTensorType::get(mlir::ArrayRef(shape), ty); + }); +} diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 6d3131a781c..f1271d0da24 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -70,9 +70,9 @@ tool_dirs = config.mlir_tf_tools_dirs + [ ] tool_names = [ 'mlir-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', - 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', - 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-gpu-opt', - 'xla-opt' + 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', + 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 661e6200df3..3e7596c75d7 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -44,6 +44,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir', 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/tensorflow', + 'tensorflow/compiler/mlir/tfjs', 'tensorflow/compiler/mlir/xla', 'tensorflow/compiler/aot', 'tensorflow/compiler/xla/service/mlir_gpu', diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 9099f2be2e1..05b2f891676 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -36,7 +36,7 @@ filegroup( "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - 
"@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -224,7 +224,10 @@ cc_library( hdrs = [ "ir/tf_attributes.h", ], - deps = ["@llvm-project//mlir:IR"], + deps = [ + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + ], ) cc_library( @@ -427,6 +430,7 @@ cc_library( "transforms/parallel_execute_to_islands.cc", "transforms/promote_resources_to_args.cc", "transforms/raise_control_flow.cc", + "transforms/readonly_references_to_resources.cc", "transforms/replicate_invariant_op_hoisting.cc", "transforms/replicate_to_island.cc", "transforms/resource_device_inference.cc", @@ -448,6 +452,7 @@ cc_library( "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", + "transforms/tpu_space_to_depth_pass.cc", "transforms/tpu_variable_runtime_reformatting.cc", "translate/breakup-islands.cc", "translate/control_to_executor_dialect.cc", @@ -555,8 +560,7 @@ cc_library( srcs = ["ir/dialect_registration.cc"], deps = [ ":tensorflow", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", + "@llvm-project//mlir:Shape", ], alwayslink = 1, ) @@ -785,6 +789,9 @@ cc_library( name = "convert_type", srcs = ["utils/convert_type.cc"], hdrs = ["utils/convert_type.h"], + visibility = [ + "//visibility:public", + ], deps = [ ":tensorflow_types", "//tensorflow/core:framework", @@ -823,6 +830,7 @@ cc_library( ":mangling_util", ":tensorflow_attributes", ":tensorflow_types", + "//tensorflow/compiler/xla:util", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1074,7 +1082,7 @@ genrule( srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", "ir/tf_generated_ops.td", "ir/tf_op_base.td", @@ -1139,6 +1147,7 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/xla:xla_sink_constants_to_control_flow", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", @@ -1277,6 +1286,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) @@ -1291,6 +1301,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/protobuf/tpu:topology_proto_cc", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index ff1620347f7..f7b88317cd4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -49,7 +49,7 @@ namespace TF { namespace { constexpr int64_t kUnknownResourceId = -1; -constexpr char kResourceArgUniqueIdAttr[] = "tf.resource_arg_unique_id"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; // Returns if a VarHandleOp is anonymous, which means it always creates a new // variable. 
diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD new file mode 100644 index 00000000000..9528874f419 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -0,0 +1,55 @@ +load( + "//tensorflow:tensorflow.bzl", + "tf_copts", + "tf_cuda_library", + "tfe_xla_copts", +) + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + packages = ["//tensorflow/..."], +) + +tf_cuda_library( + name = "mlir_c_api", + srcs = [ + "c_api_unified_experimental_mlir.cc", + ], + copts = tf_copts() + tfe_xla_copts(), + deps = [ + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c:tf_status_internal", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "mlir_c_api_registration", + srcs = ["c_api_unified_experimental_mlir_registration.cc"], + deps = [ + ":mlir_c_api", + "//tensorflow/c/eager:c_api_unified_internal", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc new file mode 100644 index 00000000000..0e8b7fedd9b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -0,0 +1,493 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" + +namespace mlir { +namespace TF { +using tensorflow::internal::AbstractFunction; +using tensorflow::internal::AbstractOp; +using tensorflow::internal::AbstractTensor; +using tensorflow::internal::dyncast; +using tensorflow::internal::ExecutionContext; +using tensorflow::internal::OutputList; + +namespace { + +static void RegisterDialects() { + static bool init_once = []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + return true; + }(); + (void)init_once; +} + +Status ConvertDataTypeToTensor(tensorflow::DataType dtype, Builder builder, + Type* type) { + Status s = tensorflow::ConvertDataType(dtype, builder, type); + if (s.ok()) *type = UnrankedTensorType::get(*type); + return s; +} + +class MlirTensor : public AbstractTensor { + public: + explicit MlirTensor(Value value) : AbstractTensor(kKind), value_(value) {} + + Value getValue() { return value_; } + + static constexpr AbstractTensorKind kKind = kMlirTensor; + + private: + Value value_; +}; + +class MlirAbstractOp : public AbstractOp { + public: + explicit MlirAbstractOp(MLIRContext* context) + : AbstractOp(kKind), context_(context) {} + + void SetOpType(const char* op_type, TF_Status* s) override; + + void SetAttrType(const char* attr_name, TF_DataType dtype, + TF_Status* s) override; + + void SetOpName(const char* const op_name, TF_Status* s) override; + + MLIRContext* GetContext() { return context_; } + + Type AddRef(Type type, TF_Status* s); + + OperationState* Create(ArrayRef operands, TF_Status* s); + + static constexpr AbstractOpKind kKind = kMlirOp; + + private: + MLIRContext* context_; + llvm::StringMap attrs_; + std::unique_ptr state_; + const char* op_name_ = 
nullptr; +}; + +// MlirFunction is a thin wrapper over a FuncOp. +class MlirFunction : public AbstractFunction { + public: + explicit MlirFunction(std::unique_ptr context, + OwningModuleRef module, FuncOp func) + : AbstractFunction(kKind), + context_(std::move(context)), + module_(std::move(module)), + func_(func) {} + + TF_Function* GetTfFunction(TF_Status* s) override; + + static constexpr AbstractFunctionKind kKind = kGraphFunc; + + private: + std::unique_ptr context_; + OwningModuleRef module_; + FuncOp func_; +}; + +class MlirFunctionContext : public ExecutionContext { + public: + explicit MlirFunctionContext(const char* name) + : ExecutionContext(kKind), + context_(std::make_unique()), + builder_(context_.get()) { + // TODO(aminim) figure out the location story here + module_ = ModuleOp::create(builder_.getUnknownLoc()); + func_ = FuncOp::create(builder_.getUnknownLoc(), name, + builder_.getFunctionType(llvm::None, llvm::None)); + module_->push_back(func_); + builder_ = OpBuilder::atBlockBegin(func_.addEntryBlock()); + } + + AbstractOp* CreateOperation() override { + return new MlirAbstractOp(context_.get()); + } + + void ExecuteOperation(AbstractOp* abstract_op, int num_inputs, + AbstractTensor* const* inputs, OutputList* o, + TF_Status* s) override; + + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override; + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override; + + void RegisterFunction(AbstractFunction* func, TF_Status* s) override { + s->status = tensorflow::errors::Unimplemented( + "Registering graph functions has not been implemented yet."); + } + + static constexpr ExecutionContextKind kKind = kMlirContext; + + private: + std::unique_ptr context_; + OpBuilder builder_; + FuncOp func_; + OwningModuleRef module_; +}; + +void MlirAbstractOp::SetOpType(const char* op_type, TF_Status* s) { + if (state_) { + s->status = tensorflow::errors::FailedPrecondition( + "SetOpType called on already built op."); + return; + } + std::string name = "tf."; + name += op_type; + // TODO(aminim) figure out the location story here + state_ = std::make_unique(UnknownLoc::get(context_), name); +} + +void MlirAbstractOp::SetAttrType(const char* attr_name, TF_DataType dtype, + TF_Status* s) { + if (!state_) { + s->status = tensorflow::errors::FailedPrecondition( + "op_type must be specified before specifying attrs."); + return; + } + Type mlir_type; + Builder builder(context_); + s->status = ConvertDataTypeToTensor(static_cast(dtype), + builder, &mlir_type); + if (!s->status.ok()) return; + attrs_[attr_name] = TypeAttr::get(mlir_type); +} + +void MlirAbstractOp::SetOpName(const char* const op_name, TF_Status* s) { + // TODO(aminim): should we use a location? 
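Taken together, the classes above give the tracing flow: a MlirFunctionContext owns the MLIRContext, module and entry block; AddParameter adds typed block arguments; each MlirAbstractOp is configured via SetOpType/SetAttrType and then materialized by ExecuteOperation. A rough sketch of that call sequence, using only the methods declared above (error handling and ownership elided; these classes live in this file, not in an exported header):

```c++
// Sketch only: trace z = tf.AddV2(x, y) into a function named "add_fn".
void TraceAddExample() {
  TF_Status* status = TF_NewStatus();
  MlirFunctionContext ctx("add_fn");

  AbstractTensor* x = ctx.AddParameter(TF_FLOAT, status);
  AbstractTensor* y = ctx.AddParameter(TF_FLOAT, status);

  AbstractOp* add = ctx.CreateOperation();
  add->SetOpType("AddV2", status);  // recorded as the MLIR op name "tf.AddV2"
  add->SetOpName("z", status);

  OutputList results;
  AbstractTensor* inputs[] = {x, y};
  ctx.ExecuteOperation(add, /*num_inputs=*/2, inputs, &results, status);
  // results.outputs now holds a MlirTensor wrapping the tf.AddV2 result.
  TF_DeleteStatus(status);
}
```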
+ if (op_name_) { + s->status = tensorflow::errors::FailedPrecondition( + "SetOpName called on already built op."); + return; + } + op_name_ = op_name; +} + +Type MlirAbstractOp::AddRef(Type type, TF_Status* s) { + Type elt_type = getElementTypeOrSelf(type); + if (elt_type.isa()) { + s->status = tensorflow::errors::InvalidArgument( + "Requested reference to a reference type"); + return nullptr; + } + elt_type = TensorFlowRefType::get(elt_type); + if (RankedTensorType tensor_type = type.dyn_cast()) { + return RankedTensorType::get(tensor_type.getShape(), elt_type); + } + return UnrankedTensorType::get(elt_type); +} + +OperationState* MlirAbstractOp::Create(ArrayRef operands, TF_Status* s) { + state_->operands = llvm::to_vector<4>(operands); + const tensorflow::OpDef* op_def; + auto node_name = state_->name.getStringRef().drop_front( + TensorFlowDialect::getDialectNamespace().size() + 1); + s->status = + tensorflow::OpRegistry::Global()->LookUpOpDef(node_name.str(), &op_def); + if (!s->status.ok()) return nullptr; + Builder builder(context_); + // Process operands according to the op_def and infer derived attributes. + int current_operand = 0; + for (const tensorflow::OpDef::ArgDef& input_arg : op_def->input_arg()) { + if (!input_arg.number_attr().empty()) { + // TODO(b/156122856): we don't support variadic operands. + s->status = tensorflow::errors::Unimplemented( + "Unsupported 'number_attr' for '", input_arg.number_attr(), "'"); + return nullptr; + } else if (!input_arg.type_list_attr().empty()) { + s->status = tensorflow::errors::InvalidArgument( + "Unsupported 'type_list_attr' for '", input_arg.number_attr(), "'"); + return nullptr; + } + if (current_operand >= operands.size()) { + s->status = tensorflow::errors::InvalidArgument("Missing operand for '", + input_arg.name(), "'"); + return nullptr; + } + Type expected_type; + if (input_arg.type() != tensorflow::DT_INVALID) { + s->status = + ConvertDataTypeToTensor(input_arg.type(), builder, &expected_type); + if (!s->status.ok()) return nullptr; + if (input_arg.is_ref()) expected_type = AddRef(expected_type, s); + if (!s->status.ok()) return nullptr; + } else { + expected_type = operands[current_operand].getType(); + } + if (!input_arg.type_attr().empty()) { + attrs_[input_arg.type_attr()] = TypeAttr::get(expected_type); + } + ++current_operand; + } + + for (const tensorflow::OpDef::ArgDef& output_arg : op_def->output_arg()) { + int original_size = state_->types.size(); + if (!output_arg.number_attr().empty()) { + // Same type repeated "repeats" times. + Attribute repeats_attr = attrs_[output_arg.number_attr()]; + if (!repeats_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.number_attr(), + "' required for output list '", output_arg.name(), "'"); + return nullptr; + } + if (!repeats_attr.isa()) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.number_attr(), + "' required for output list '", output_arg.name(), + "' isn't an integer"); + return nullptr; + } + int64_t repeats = repeats_attr.cast().getInt(); + + if (!output_arg.type_attr().empty()) { + // Same type repeated "repeats" times. 
+ Attribute attr = attrs_[output_arg.type_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_attr(), "' required for output '", + output_arg.name(), "' isn't a type attribute"); + return nullptr; + } + for (int i = 0; i < repeats; ++i) + state_->types.push_back(type_attr.getType()); + } else if (output_arg.type() != tensorflow::DT_INVALID) { + for (int i = 0; i < repeats; ++i) { + Type type; + s->status = + ConvertDataTypeToTensor(output_arg.type(), builder, &type); + if (!s->status.ok()) return nullptr; + state_->types.push_back(type); + } + } else { + s->status = tensorflow::errors::InvalidArgument( + "Missing type or type_attr field in ", + output_arg.ShortDebugString()); + return nullptr; + } + } else if (!output_arg.type_attr().empty()) { + Attribute attr = attrs_[output_arg.type_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_attr(), "' required for output '", + output_arg.name(), "' isn't a type attribute"); + return nullptr; + } + state_->types.push_back(type_attr.getValue()); + } else if (!output_arg.type_list_attr().empty()) { + // This is pointing to an attribute which is an array of types. + Attribute attr = attrs_[output_arg.type_list_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + ArrayAttr array_attr = attr.dyn_cast(); + if (!array_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' isn't an array attribute"); + return nullptr; + } + for (Attribute attr : array_attr) { + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Array Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' has a non-Type element"); + return nullptr; + } + state_->types.push_back(type_attr.getValue()); + } + } else if (output_arg.type() != tensorflow::DT_INVALID) { + Type type; + Builder builder(context_); + s->status = ConvertDataTypeToTensor(output_arg.type(), builder, &type); + if (!s->status.ok()) return nullptr; + state_->types.push_back(type); + } else { + s->status = tensorflow::errors::InvalidArgument( + "No type fields in ", output_arg.ShortDebugString()); + if (!s->status.ok()) return nullptr; + } + if (output_arg.is_ref()) { + // For all types that were added by this function call, make them refs. 
+ for (Type& type : llvm::make_range(&state_->types[original_size], + state_->types.end())) { + type = AddRef(type, s); + if (!s->status.ok()) return nullptr; + } + } + } + return state_.get(); +} + +TF_Function* MlirFunction::GetTfFunction(TF_Status* s) { + PassManager pm(func_.getContext()); + pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); + pm.addNestedPass(CreateBreakUpIslandsPass()); + + // In case of failure, the `diag_handler` converts MLIR errors emitted to + // the MLIRContext into a tensorflow::Status. + StatusScopedDiagnosticHandler diag_handler(func_.getContext()); + LogicalResult result = pm.run(func_.getParentOfType()); + (void)result; + s->status = diag_handler.ConsumeStatus(); + if (!s->status.ok()) return nullptr; + + tensorflow::GraphExportConfig configs; + std::unique_ptr tf_function(new TF_Function); + s->status = ConvertMlirFunctionToFunctionLibraryDef(func_, configs, + &tf_function->fdef); + return tf_function.release(); +} + +void MlirFunctionContext::ExecuteOperation(AbstractOp* abstract_op, + int num_inputs, + AbstractTensor* const* inputs, + OutputList* o, TF_Status* s) { + auto* mlir_op = dyncast(abstract_op); + if (mlir_op == nullptr) { + s->status = tensorflow::errors::InvalidArgument( + "Unable to cast AbstractOp to TF_GraphOp."); + return; + } + SmallVector operands; + for (int i = 0; i < num_inputs; ++i) { + auto* operand = dyncast(inputs[i]); + if (!operand) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing eager tensors is not supported yet."); + return; + } + if (operand->getValue().getContext() != context_.get()) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing tensors from other context is not supported."); + return; + } + operands.push_back(operand->getValue()); + } + OperationState* state = mlir_op->Create(operands, s); + if (!s->status.ok() || !state) return; + Operation* op = builder_.createOperation(*state); + int num_results = op->getNumResults(); + o->outputs.clear(); + o->outputs.reserve(num_results); + for (Value result : op->getResults()) + o->outputs.push_back(new MlirTensor(result)); +} + +AbstractTensor* MlirFunctionContext::AddParameter(TF_DataType dtype, + TF_Status* s) { + Type type; + s->status = ConvertDataTypeToTensor(static_cast(dtype), + builder_, &type); + if (!s->status.ok()) return nullptr; + return new MlirTensor(func_.getBody().front().addArgument(type)); +} + +AbstractFunction* MlirFunctionContext::Finalize(OutputList* outputs, + TF_Status* s) { + Block& body = func_.getBody().front(); + SmallVector ret_operands; + for (AbstractTensor* output : outputs->outputs) { + auto* operand = dyncast(output); + if (!operand) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing eager tensors is not supported yet."); + return nullptr; + } + if (operand->getValue().getContext() != context_.get()) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing tensors from other context is not supported."); + return nullptr; + } + ret_operands.push_back(operand->getValue()); + } + builder_.create(func_.getLoc(), ret_operands); + + auto arg_types = llvm::to_vector<8>(body.getArgumentTypes()); + auto result_types = + llvm::to_vector<8>(body.getTerminator()->getOperandTypes()); + func_.setType(FunctionType::get(arg_types, result_types, func_.getContext())); + return new MlirFunction(std::move(context_), std::move(module_), func_); +} + +extern "C" { +ExecutionContext* MlirTracingFactory(const char* fn_name, TF_Status* s) { + RegisterDialects(); + return new 
MlirFunctionContext(fn_name); +} +} + +} // end anonymous namespace +} // end namespace TF +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc new file mode 100644 index 00000000000..778f4b777a3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" + +using tensorflow::internal::ExecutionContext; + +extern "C" { +ExecutionContext* MlirTracingFactory(const char* fn_name, TF_Status* s); +} + +namespace { +// Register the tracing implemented in this file as the default tracing engine. +static bool register_tracing = [] { + RegisterTracingEngineFactory("mlir", MlirTracingFactory); + return true; +}(); + +} // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 15a4ecfc537..39245425a5a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace TFControlFlow { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc index ac468d9810c..c95d7b7ca7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -31,5 +32,6 @@ static DialectRegistration tf_device_dialect; static DialectRegistration tf_saved_model_dialect; +static DialectRegistration shape_dialect; } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc index 6797c04ebcf..dfad1fce26d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "mlir/IR/Attributes.h" // from @llvm-project + namespace mlir { namespace TF { @@ -45,6 +47,28 @@ struct ShapeAttrStorage : public AttributeStorage { bool unranked = false; }; +// The storage class for FuncAttr. +struct FuncAttrStorage : public AttributeStorage { + using KeyTy = std::pair; + + explicit FuncAttrStorage(Attribute name, Attribute attrs) + : name(name), attrs(attrs) {} + + bool operator==(const KeyTy& key) const { return key == KeyTy(name, attrs); } + static unsigned hashKey(const KeyTy& key) { + return llvm::hash_combine(key.first, key.second); + } + + static FuncAttrStorage* construct(mlir::AttributeStorageAllocator& allocator, + const KeyTy& key) { + return new (allocator.allocate()) + FuncAttrStorage(key.first, key.second); + } + + Attribute name; + Attribute attrs; +}; + } // namespace detail // Get or create a shape attribute. @@ -85,5 +109,24 @@ bool ShapeAttr::hasStaticShape() const { return true; } +FuncAttr FuncAttr::get(mlir::MLIRContext* context, llvm::StringRef name, + DictionaryAttr attr) { + auto symbol = SymbolRefAttr::get(name, context); + return Base::get(context, AttrKind::FUNC, symbol, attr); +} + +FuncAttr FuncAttr::get(mlir::MLIRContext* context, SymbolRefAttr symbol, + DictionaryAttr attr) { + return Base::get(context, AttrKind::FUNC, symbol, attr); +} + +SymbolRefAttr FuncAttr::GetName() const { + return getImpl()->name.cast(); +} + +DictionaryAttr FuncAttr::GetAttrs() const { + return getImpl()->attrs.cast(); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h index 4d85dd95cea..1edc7356ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -18,6 +18,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ATTRIBUTES_H_ +#include "llvm/ADT/StringRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project namespace mlir { @@ -25,11 +26,12 @@ namespace TF { namespace AttrKind { -// List of supported custom TensorFlow Attributes kinds, necessary for +// List of supported custom TensorFlow Attribute kinds, necessary for // isa/dyn_cast. 
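The FuncAttrStorage and accessors above define the new #tf.func attribute as a pair of a SymbolRefAttr and a DictionaryAttr. A hedged construction sketch (names are illustrative; assumes the TensorFlow dialect is registered with the context and the MLIR APIs as of this change, where DictionaryAttr::get takes the context last):

```c++
#include "mlir/IR/Attributes.h"   // from @llvm-project
#include "mlir/IR/Builders.h"     // from @llvm-project
#include "mlir/IR/Identifier.h"   // from @llvm-project
#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"

void FuncAttrExample() {
  mlir::MLIRContext context;
  mlir::Builder builder(&context);

  // {some_attr = "value"} as a DictionaryAttr.
  mlir::NamedAttribute entry(mlir::Identifier::get("some_attr", &context),
                             builder.getStringAttr("value"));
  auto dict = mlir::DictionaryAttr::get({entry}, &context);

  // Prints as #tf.func<@my_fn, {some_attr = "value"}>.
  auto func_attr = mlir::TF::FuncAttr::get(&context, "my_fn", dict);
  mlir::SymbolRefAttr name = func_attr.GetName();
  mlir::DictionaryAttr attrs = func_attr.GetAttrs();
  (void)name;
  (void)attrs;
}
```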
enum Kind { FIRST_USED_TENSORFLOW_ATTR = Attribute::FIRST_TENSORFLOW_ATTR, SHAPE = FIRST_USED_TENSORFLOW_ATTR, + FUNC, LAST_USED_TENSORFLOW_ATTR, }; @@ -38,6 +40,7 @@ enum Kind { namespace detail { struct ShapeAttrStorage; +struct FuncAttrStorage; } // namespace detail @@ -71,6 +74,33 @@ class ShapeAttr : public Attribute::AttrBase. It is +// currently printed and parsed for the following format: +// +// #tf.func<@symbol, {attr = "value"}> +// +// where the first element is the SymbolRefAttr and the second element is the +// DictionaryAttr. +class FuncAttr + : public Attribute::AttrBase { + public: + using Base::Base; + + static FuncAttr get(mlir::MLIRContext* context, llvm::StringRef name, + DictionaryAttr attr); + + static FuncAttr get(mlir::MLIRContext* context, SymbolRefAttr symbol, + DictionaryAttr attr); + + SymbolRefAttr GetName() const; + + DictionaryAttr GetAttrs() const; + + static bool kindof(unsigned kind) { return kind == AttrKind::FUNC; } +}; + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index d5ecbf3e292..9daebc22ba1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -47,37 +47,6 @@ limitations under the License. namespace mlir { namespace tf_executor { -namespace { - -// If the given tensor has elements of type with subtypes, then returns a new -// type after dropping subtypes info. Otherwise, returns the original type as -// is. -ShapedType DropTypeSubTypes(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto subtype_ty = element_ty.dyn_cast(); - if (!subtype_ty) return ty; - - Type default_ty = GetDefaultTypeOf(subtype_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); -} - -// If the given tensor has elements of type ref, then returns a new type -// of the shape, but corresponding non-ref type as element type. Otherwise, -// returns the original type as is. -ShapedType DropRefType(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto ref_ty = element_ty.dyn_cast(); - if (!ref_ty) return ty; - - Type default_ty = GetDefaultTypeOf(ref_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); -} - -} // namespace //===----------------------------------------------------------------------===// // TF Executor Dialect @@ -85,6 +54,9 @@ ShapedType DropRefType(ShapedType ty) { namespace { +using TF::DropRefType; +using TF::DropTypeSubTypes; + struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 2b3dd529c3b..59443ce3547 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -192,6 +192,44 @@ retained with length 1. let verifier = [{ return Verify(*this); }]; } +def TF_AllToAllOp : TF_Op<"AllToAll", [NoSideEffect]> { + let summary = "An Op to exchange data across TPU replicas."; + + let description = [{ +On each replica, the input is split into `split_count` blocks along +`split_dimension` and send to the other replicas given group_assignment. 
After +receiving `split_count` - 1 blocks from other replicas, we concatenate the +blocks along `concat_dimension` as the output. + +For example, suppose there are 2 TPU replicas: +replica 0 receives input: `[[A, B]]` +replica 1 receives input: `[[C, D]]` + +group_assignment=`[[0, 1]]` +concat_dimension=0 +split_dimension=1 +split_count=2 + +replica 0's output: `[[A], [C]]` +replica 1's output: `[[B], [D]]` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$group_assignment, + + I64Attr:$concat_dimension, + I64Attr:$split_dimension, + I64Attr:$split_count + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AngleOp : TF_Op<"Angle", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Returns the argument of a complex number."; @@ -1157,6 +1195,46 @@ subsequent operation and then be optimized away, however.) }]; } +def TF_CaseOp : TF_Op<"Case", []> { + let summary = [{ +An n-way switch statement which calls a single branch function. + }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... + case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + Variadic:$input, + + Confined]>:$branches, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; +} + def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Cast x of type SrcT to y of DstT."; @@ -1217,7 +1295,7 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, SameOperandsAndResultType]> { +def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect]> { let summary = "Clips tensor values to a specified min and max."; let description = [{ @@ -1240,6 +1318,103 @@ greater than `clip_value_max` are set to `clip_value_max`. 
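Dropping SameOperandsAndResultType from ClipByValue above fits the op's contract, presumably because clip_value_min and clip_value_max are allowed to be scalars rather than tensors of the same type as t, so only t shares the result type. Its elementwise behaviour for scalar bounds, in plain C++ purely for illustration:

```c++
#include <algorithm>
#include <vector>

// Elementwise semantics of ClipByValue with scalar bounds (illustrative only;
// the real op also accepts bounds shaped like t).
std::vector<float> ClipByValue(std::vector<float> t, float clip_value_min,
                               float clip_value_max) {
  for (float& v : t)
    v = std::min(std::max(v, clip_value_min), clip_value_max);
  return t;
}

// ClipByValue({-2.0f, 0.5f, 3.0f}, -1.0f, 1.0f) == {-1.0f, 0.5f, 1.0f}
```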
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { + let summary = "Receives a tensor value broadcast from another device."; + + let description = [{ + }]; + + let arguments = (ins + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedResultTypeAttr T = TF_DerivedResultTypeAttr<0>; +} + +def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { + let summary = "Broadcasts a tensor value to one or more other devices."; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I1, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveGatherOp : TF_Op<"CollectiveGather", []> { + let summary = [{ +Mutually accumulates multiple tensors of identical type and shape. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveReduceOp : TF_Op<"CollectiveReduce", [SameOperandsAndResultType]> { + let summary = [{ +Mutually reduces multiple tensors of identical type and shape. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_AnyStrAttrOf<["Min", "Max", "Mul", "Add"]>:$merge_op, + TF_AnyStrAttrOf<["Id", "Div"]>:$final_op, + I64ArrayAttr:$subdiv_offsets, + DefaultValuedAttr:$wait_for, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ComplexOp : TF_Op<"Complex", [NoSideEffect, ResultsBroadcastableShape]> { let summary = "Converts two real numbers to a complex number."; @@ -1408,6 +1583,30 @@ tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] let hasCanonicalizer = 1; } +def TF_ConjugateTransposeOp : TF_Op<"ConjugateTranspose", [NoSideEffect]> { + let summary = [{ +Shuffle dimensions of x according to a permutation and conjugate the result. + }]; + + let description = [{ +The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: + `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]` + `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])` + }]; + + let arguments = (ins + TF_Tensor:$x, + TF_I32OrI64Tensor:$perm + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tperm = TF_DerivedOperandTypeAttr<1>; +} + def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect, TF_LayoutSensitiveInterface]> { let summary = [{ Computes a 2-D convolution given 4-D `input` and `filter` tensors. 
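Among the ops added above, ConjugateTranspose combines a dimension permutation with complex conjugation; for a 2-D input and perm = [1, 0] it is the ordinary conjugate transpose of a matrix. A small illustration of that case:

```c++
#include <complex>
#include <vector>

using Matrix = std::vector<std::vector<std::complex<float>>>;

// y[i][j] == conj(x[j][i]) -- the 2-D case of ConjugateTranspose with
// perm = [1, 0] (illustrative only).
Matrix ConjugateTranspose2D(const Matrix& x) {
  const size_t rows = x.size();
  const size_t cols = x.empty() ? 0 : x[0].size();
  Matrix y(cols, std::vector<std::complex<float>>(rows));
  for (size_t i = 0; i < rows; ++i)
    for (size_t j = 0; j < cols; ++j)
      y[j][i] = std::conj(x[i][j]);
  return y;
}
```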
@@ -1682,7 +1881,28 @@ Given an input tensor, this function computes hyperbolic cosine of every TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [AllTypesMatch<["input", "output"]>, NoSideEffect]> { +def TF_CrossOp : TF_Op<"Cross", [NoSideEffect]> { + let summary = "Compute the pairwise cross product."; + + let description = [{ +`a` and `b` must be the same shape; they can either be simple 3-element vectors, +or any shape where the innermost dimension is 3. In the latter case, each pair +of corresponding 3-element vectors is cross-multiplied independently. + }]; + + let arguments = (ins + TF_IntOrFpTensor:$a, + TF_IntOrFpTensor:$b + ); + + let results = (outs + TF_IntOrFpTensor:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "An Op to sum inputs across replicated TPU instances."; let description = [{ @@ -1706,7 +1926,7 @@ and `B, D, F, H` as group 1. Thus we get the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CumsumOp : TF_Op<"Cumsum", [AllTypesMatch<["x", "out"]>, NoSideEffect]> { +def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; let description = [{ @@ -2427,6 +2647,76 @@ This operation creates a tensor of `shape` and `dtype`. let hasFolder = 1; } +def TF_EnqueueTPUEmbeddingRaggedTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingRaggedTensorBatch", [SameVariadicOperandSize]> { + let summary = "Eases the porting of code that uses tf.nn.embedding_lookup()."; + + let description = [{ +sample_splits[i], embedding_indices[i] and aggregation_weights[i] correspond +to the ith feature. table_ids[i] indicates which embedding table to look up ith +feature. + +The tensors at corresponding positions in two of the input lists, +embedding_indices and aggregation_weights, must have the same shape, i.e. rank 1 +with dim_size() equal to the total number of lookups into the table described by +the corresponding feature. + }]; + + let arguments = (ins + Variadic:$sample_splits, + Variadic:$embedding_indices, + Variadic:$aggregation_weights, + TF_StrTensor:$mode_override, + + DefaultValuedAttr:$device_ordinal, + DefaultValuedAttr:$combiners, + I64ArrayAttr:$table_ids, + DefaultValuedAttr:$max_sequence_lengths + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T1 = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T2 = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; +} + +def TF_EnqueueTPUEmbeddingSparseTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseTensorBatch", [SameVariadicOperandSize]> { + let summary = [{ +Eases the porting of code that uses tf.nn.embedding_lookup_sparse(). + }]; + + let description = [{ +sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond +to the ith feature. table_ids[i] indicates which embedding table to look up ith +feature. + +The tensors at corresponding positions in the three input lists (sample_indices, +embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1 +with dim_size() equal to the total number of lookups into the table described by +the corresponding feature. 
+ }]; + + let arguments = (ins + Variadic:$sample_indices, + Variadic:$embedding_indices, + Variadic:$aggregation_weights, + TF_StrTensor:$mode_override, + + DefaultValuedAttr:$device_ordinal, + DefaultValuedAttr:$combiners, + I64ArrayAttr:$table_ids, + DefaultValuedAttr:$max_sequence_lengths + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T1 = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T2 = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; +} + def TF_EqualOp : TF_Op<"Equal", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of (x == y) element-wise."; @@ -2824,6 +3114,8 @@ fill([2, 3], 9) ==> [[9, 9, 9] return Verify(*this); }]; + let hasFolder = 1; + let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Value dims, Value value" >]; @@ -3256,8 +3548,8 @@ Gather slices from `params` axis `axis` according to `indices`. let description = [{ `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -Produces an output tensor with shape `params.shape[:axis] + indices.shape + -params.shape[axis + 1:]` where: +Produces an output tensor with shape `params.shape[:axis] + +indices.shape[batch_dims:] + params.shape[axis + 1:]` where: ```python # Scalar indices (output is rank(params) - 1). @@ -4242,7 +4534,7 @@ cublas. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [AllTypesMatch<["input", "band"]>, NoSideEffect]> { +def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [NoSideEffect, TF_AllTypesMatch<["input", "band"]>]> { let summary = [{ Copy a tensor setting everything outside a central band in each innermost matrix to zero. }]; @@ -6246,6 +6538,8 @@ If `x` and `y` are reals, this will return the floating-point division. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { @@ -7349,9 +7643,15 @@ select(condition, t, e) ==> [[1, 2], ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; + + let hasCanonicalizer = 1; + + let verifier = [{ + return Verify(*this); + }]; } -def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect]> { +def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect, ResultsBroadcastableShape]> { let summary = ""; let description = [{ @@ -8224,7 +8524,7 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { ); } -def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, SameOperandsAndResultType]> { +def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "Stops gradient computation."; let description = [{ @@ -10244,6 +10544,33 @@ https://www.tensorflow.org/xla/operation_semantics#gather TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaHostComputeOp : TF_Op<"XlaHostCompute", []> { + let summary = [{ +A pseudo-op to represent host-side computation in an XLA program. 
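The GatherV2 documentation fix above adds the batch_dims term to the result-shape formula. Written out as a plain shape computation (sketch; the example shapes are illustrative):

```c++
#include <cstdint>
#include <vector>

// result.shape = params.shape[:axis] + indices.shape[batch_dims:]
//              + params.shape[axis + 1:]
std::vector<int64_t> GatherV2ResultShape(const std::vector<int64_t>& params,
                                         const std::vector<int64_t>& indices,
                                         int axis, int batch_dims) {
  std::vector<int64_t> out(params.begin(), params.begin() + axis);
  out.insert(out.end(), indices.begin() + batch_dims, indices.end());
  out.insert(out.end(), params.begin() + axis + 1, params.end());
  return out;
}

// e.g. params = [2, 3, 5], indices = [2, 4], axis = 2, batch_dims = 1
//      -> [2, 3, 4]
```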
+ }]; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrArrayAttr:$ancestors, + TF_ShapeAttrArray:$shapes, + SymbolRefAttr:$shape_inference_graph, + StrAttr:$key, + DefaultValuedAttr:$cost_estimate_ns, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF_XlaKeyValueSortOp : TF_Op<"XlaKeyValueSort", [NoSideEffect]> { let summary = "Wraps the XLA Sort operator, documented at"; @@ -10292,6 +10619,24 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { + let summary = "An op to receive a tensor from the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_ShapeAttr:$shape, + StrAttr:$key + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr Toutput = TF_DerivedResultTypeAttr<0>; +} + def TF_XlaReduceOp : TF_Op<"XlaReduce", [NoSideEffect]> { let summary = "Wraps the XLA Reduce operator, documented at"; @@ -10356,6 +10701,23 @@ i=0...N-1. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { + let summary = "An op to send a tensor to the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$key + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tinput = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaSvdOp : TF_Op<"XlaSvd", [NoSideEffect]> { let summary = [{ Computes the eigen decomposition of a batch of self-adjoint matrices @@ -10439,6 +10801,27 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { + let summary = "A host-side computation called from a TPU device."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$key, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", []> { let summary = "An op that receives embeddng activations on the TPU."; @@ -10497,3 +10880,44 @@ used to look up the program in the compilation cache. TF_DerivedResultSizeAttr num_computations = TF_DerivedResultSizeAttr<1>; TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } + +def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { + let summary = [{ +A placeholder op to receive values from a running XLA computation. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { + let summary = "A placeholder op to send values to a running XLA computation."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; +} diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index cb17341cefd..dbd8ab0fae2 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -23,7 +23,7 @@ limitations under the License. #define TF_OP_BASE include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td" //===----------------------------------------------------------------------===// @@ -70,6 +70,16 @@ class TF_OpIsBroadcastableToRes : And<[ "$_op.getOperand(" # opId # ").getType(), " "$_op.getResult(" # resId # ").getType())">]>; + +class TF_AllTypesMatchPred values> : + CPred<"TF::AreCastCompatible(llvm::makeArrayRef({"# StrJoin.result #"}))">; + +class TF_AllTypesMatch names> : + PredOpTrait< + "all of {" # StrJoin.result # "} have dynamically equal types ", + TF_AllTypesMatchPred< + !foreach(n, names, !subst("$_self", "$" # n, "$_self.getType()"))>>; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 3a4e9e5985e..c9d61abe507 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -58,6 +58,7 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/platform/logging.h" @@ -110,48 +111,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } -// Returns true if the given pair of TensorFlow types can be cast to one -// another. In other words, a single run-time value is legal for both the types. -// For example, tensor<*xf32> and tensor<3xf32> are cast compatible. -static bool AreCastCompatible(Type a, Type b) { - if (TensorCastOp::areCastCompatible(a, b)) return true; - - // Resource types may optionally contain subtypes information that does not - // match. Check subtypes compatibility when possible, otherwise treat them as - // compatible. 
- auto a_or_element_type = getElementTypeOrSelf(a); - auto b_or_element_type = getElementTypeOrSelf(b); - - auto a_kind = a_or_element_type.getKind(); - auto b_kind = b_or_element_type.getKind(); - - if (a_kind == TensorFlowTypes::RESOURCE && - b_kind == TensorFlowTypes::RESOURCE) { - auto a_resource_type = a_or_element_type.dyn_cast(); - auto b_resource_type = b_or_element_type.dyn_cast(); - bool a_has_subtype = !a_resource_type.getSubtypes().empty(); - bool b_has_subtype = !b_resource_type.getSubtypes().empty(); - - if (!a_has_subtype || !b_has_subtype) return true; - - assert(a_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - assert(b_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - - return TensorCastOp::areCastCompatible( - a_resource_type.getSubtypes().front(), - b_resource_type.getSubtypes().front()); - } - - // Variant types may optionally contain subtypes information that need not - // match. It is also not possible to compare subtypes for compatibility as - // their interpretation depends on the ops operating on them. So, accept all - // pairs of variant types. - return a_kind == TensorFlowTypes::VARIANT && - b_kind == TensorFlowTypes::VARIANT; -} - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -293,6 +252,39 @@ static LogicalResult VerifyTypesCompatibility( return success(); } +// This is a helper for the Select to SelectV2 canonicalization. The `data` rank +// refers to the rank of `t`/`e` (these two inputs have equal rank; this is +// checked in the verifier). +// +// In most cases, the predicate for Select can be used directly as the predicate +// for SelectV2. However, there is one case that varies, which is when the +// predicate is a tensor and the data is multidimensional. In this case, Select +// op semantics dictate that the predicate tensor length must match the size of +// the first data dimension. This varies from normal broadcasting semantics +// (which are used in SelectV2), so we must reshape the tensor in this case to +// be compatible. +static Value ReshapeSelectPredIfNecessary(OpBuilder *builder, Location loc, + Value cond, int data_rank) { + auto cond_tensor = cond.getType().cast(); + // Reshape is only needed in the case that the cond rank is 1 (i.e. it is + // a vector) AND t/e rank is > 1. + if (cond_tensor.getRank() != 1 || data_rank <= 1) { + // No reshape necessary. Leave cond as it is. + return cond; + } + + // This is the case where a reshape is needed. We want to construct the + // shape [x,1,...1], where x is the value in the pred tensor and the + // length of the shape is equal to data_rank. + SmallVector shape(data_rank, 1); + shape[0] = cond_tensor.getShape().front(); + auto new_shape_type = + RankedTensorType::get({data_rank}, builder->getIntegerType(64)); + auto shape_attr = DenseIntElementsAttr::get(new_shape_type, shape); + auto new_shape = builder->create(loc, shape_attr); + return builder->create(loc, cond, new_shape); +} + //===----------------------------------------------------------------------===// // Helper functions detect device capabilities from RuntimeDevices. 
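ReshapeSelectPredIfNecessary above covers the one case where Select and SelectV2 disagree: a rank-1 predicate against data of rank greater than 1. It reshapes the predicate to [len, 1, ..., 1] so that SelectV2's broadcasting reproduces Select's per-row semantics. The shape it builds, as a standalone sketch:

```c++
#include <cstdint>
#include <vector>

// Shape produced by the canonicalization for a rank-1 predicate: the
// predicate length followed by (data_rank - 1) ones.
std::vector<int64_t> SelectPredReshape(int64_t pred_len, int data_rank) {
  std::vector<int64_t> shape(data_rank, 1);
  shape[0] = pred_len;
  return shape;
}

// SelectPredReshape(4, 3) -> {4, 1, 1}; broadcasting [4, 1, 1] against
// [4, d1, d2] data then matches Select's "one predicate per row" behaviour.
```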
//===----------------------------------------------------------------------===// @@ -503,9 +495,10 @@ LogicalResult FoldOperandsPermutation( namespace { // Folder that returns LHS of an Arithmetic Op if the RHS is a constant // known to be Identity (e.g X+0) -template ::value>::type * = nullptr> +template < + typename OpT, + typename std::enable_if::value>::type * = nullptr> OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, ArrayRef operands) { auto result_op_type = arithmetic_op.getResult().getType(); @@ -520,7 +513,8 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, // Mul and Div ops have identity value one while AddV2 and SubOp have identity // value zero. int identity = - (std::is_same::value || std::is_same::value); + (std::is_same::value || std::is_same::value || + std::is_same::value); Type element_ty = lhs_type.getElementType(); Attribute identity_attr; @@ -537,6 +531,12 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, return arithmetic_op.x(); } + auto rhs_type = arithmetic_op.y().getType().template cast(); + // TODO(chhe): we could fold and add an identity to force the broadcast. + if (result_op_type != rhs_type) { + return {}; + } + bool is_symmetric = (std::is_same::value || std::is_same::value); if (auto attr = operands[0].dyn_cast_or_null()) { @@ -984,20 +984,17 @@ void ConstOp::build(OpBuilder &builder, OperationState &result, Type type, LogicalResult ConstOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { - for (NamedAttribute named_attr : attributes) { - if (named_attr.first.strref() != "value") continue; - auto value = named_attr.second; - if (auto elem_attr = value.dyn_cast()) { - inferredReturnTypes.assign({elem_attr.getType()}); - return success(); - } - return emitOptionalError(location, - "attribute 'value' failed to satisfy constraint: " - "constant vector/tensor"); + auto value = attributes.get("value"); + if (!value) return emitOptionalError(location, "missing attribute 'value'"); + if (auto elem_attr = value.dyn_cast()) { + inferredReturnTypes.assign({elem_attr.getType()}); + return success(); } - return emitOptionalError(location, "missing attribute 'value'"); + return emitOptionalError(location, + "attribute 'value' failed to satisfy constraint: " + "constant vector/tensor"); } //===----------------------------------------------------------------------===// @@ -1300,8 +1297,8 @@ static LogicalResult Verify(DataFormatVecPermuteOp op) { if (rank == 1) { int64_t dim0 = input_ty.getDimSize(0); - if (dim0 != ShapedType::kDynamicSize && dim0 != 4) - return op.emitOpError("requires 1D input of size 4"); + if (dim0 != ShapedType::kDynamicSize && dim0 != 4 && dim0 != 2) + return op.emitOpError("requires 1D input of size 4 or size 2"); } if (rank == 2) { @@ -1416,7 +1413,7 @@ static LogicalResult Verify(DynamicStitchOp op) { auto expected_out_ty = RankedTensorType::get(expected_shape, out_ty.getElementType()); - if (!AreCastCompatible(out_ty, expected_out_ty)) { + if (!AreCastCompatible({out_ty, expected_out_ty})) { return op.emitOpError() << "has invalid output type; should be " "compatible with inferred type " << expected_out_ty; @@ -1650,7 +1647,7 @@ static ShapedType InferFillOpType(Value dims, Value value) { llvm::SmallVector shape; shape.reserve(dims_attr.getNumElements()); - for (const APInt &dim : dims_attr.getValues()) { + for (const APInt dim : 
dims_attr.getValues()) { shape.push_back(dim.getSExtValue()); } return RankedTensorType::get(shape, etype); @@ -1661,6 +1658,35 @@ void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); } +OpFoldResult FillOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "fill op has two operand"); + + auto type = getType().cast(); + // DenseElementsAttr that is used in this folder only supports int and float + // types. + // TODO(hinsu): Handle complex types once there is a attribute kind for + // complex. + if (!type.getElementType().isIntOrFloat()) return {}; + + auto value = operands[1].dyn_cast_or_null(); + if (!value) return {}; + + if (type.hasStaticShape()) + return DenseElementsAttr::get(type, value.getValue({})); + + auto dims = operands[0].dyn_cast_or_null(); + if (!dims) return {}; + + llvm::SmallVector shape; + shape.reserve(dims.getNumElements()); + for (const APInt dim : dims.getValues()) { + shape.push_back(dim.getSExtValue()); + } + type = RankedTensorType::get(shape, type.getElementType()); + + return DenseElementsAttr::get(type, value.getValue({})); +} + //===----------------------------------------------------------------------===// // FusedBatchNormGradOp //===----------------------------------------------------------------------===// @@ -1795,75 +1821,129 @@ static LogicalResult Verify(GatherV2Op op) { static LogicalResult Verify(IfOp op) { auto module = op.getParentOfType(); - auto thenFn = module.lookupSymbol(op.then_branch()); - if (!thenFn) + auto then_fn = module.lookupSymbol(op.then_branch()); + if (!then_fn) return op.emitOpError("then_branch refers to an undefined function : ") << op.then_branch(); - auto elseFn = module.lookupSymbol(op.else_branch()); - if (!elseFn) + auto else_fn = module.lookupSymbol(op.else_branch()); + if (!else_fn) return op.emitOpError("else_branch refers to an undefined function : ") << op.else_branch(); - auto thenFuncType = thenFn.getType(); - auto elseFuncType = elseFn.getType(); + auto then_fn_type = then_fn.getType(); + auto else_fn_type = else_fn.getType(); // Non-conditional operands starting with the second operand are passed to // branches and should be pair-wise compatible with branches' inputs. 
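A rough standalone sketch of what that pair-wise compatibility check amounts to (plain C++ with strings standing in for MLIR types; CastCompatible stands in for AreCastCompatible, and the function name is illustrative only):

#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Returns an empty string on success, or a diagnostic describing the first
// mismatch, mirroring the verifier structure below.
std::string CheckIfBranchInputs(
    const std::vector<std::string>& op_operand_types,  // without the cond operand
    const std::vector<std::string>& then_input_types,
    const std::vector<std::string>& else_input_types,
    const std::function<bool(const std::string&, const std::string&)>&
        CastCompatible) {
  if (then_input_types.size() != op_operand_types.size() ||
      else_input_types.size() != op_operand_types.size())
    return "branches should have " + std::to_string(op_operand_types.size()) +
           " inputs";
  for (size_t i = 0; i < op_operand_types.size(); ++i) {
    if (!CastCompatible(op_operand_types[i], then_input_types[i]))
      return "then branch input incompatible with operand at index " +
             std::to_string(i);
    if (!CastCompatible(op_operand_types[i], else_input_types[i]))
      return "else branch input incompatible with operand at index " +
             std::to_string(i);
    // If the two branch inputs are incompatible with each other, no tensor
    // could feed both functions, so the op itself is invalid.
    if (!CastCompatible(then_input_types[i], else_input_types[i]))
      return "branch inputs incompatible with each other at index " +
             std::to_string(i);
  }
  return "";
}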
- unsigned expectedNumInputs = op.getNumOperands() - 1; - if (thenFuncType.getNumInputs() != expectedNumInputs || - elseFuncType.getNumInputs() != expectedNumInputs) - return op.emitError("branches should have " + Twine(expectedNumInputs) + + unsigned expected_num_inputs = op.getNumOperands() - 1; + if (then_fn_type.getNumInputs() != expected_num_inputs || + else_fn_type.getNumInputs() != expected_num_inputs) + return op.emitError("branches should have " + Twine(expected_num_inputs) + " inputs"); - for (unsigned i = 0; i < expectedNumInputs; ++i) { - auto operandType = op.getOperand(i + 1).getType().cast(); - auto thenInputType = thenFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, thenInputType)) + for (unsigned i = 0; i < expected_num_inputs; ++i) { + auto operand_type = op.getOperand(i + 1).getType().cast(); + auto then_input_type = then_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, then_input_type})) return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", - thenInputType, operandType, i)); + then_input_type, operand_type, i)); - auto elseInputType = elseFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, elseInputType)) + auto else_input_type = else_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, else_input_type})) return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", - elseInputType, operandType, i)); + else_input_type, operand_type, i)); // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible(thenInputType, elseInputType)) + if (!AreCastCompatible({then_input_type, else_input_type})) return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", - thenInputType, elseInputType, i)); + then_input_type, else_input_type, i)); } // Branches' results should be pair-wise compatible with the op results. 
- unsigned expectedNumResults = op.getNumResults(); - if (thenFuncType.getNumResults() != expectedNumResults || - elseFuncType.getNumResults() != expectedNumResults) - return op.emitError("branches should have " + Twine(expectedNumResults) + + unsigned expected_num_results = op.getNumResults(); + if (then_fn_type.getNumResults() != expected_num_results || + else_fn_type.getNumResults() != expected_num_results) + return op.emitError("branches should have " + Twine(expected_num_results) + " results"); - for (unsigned i = 0; i < expectedNumResults; ++i) { - auto resultType = op.getResult(i).getType().cast(); - auto thenResultType = thenFuncType.getResult(i).cast(); - if (!AreCastCompatible(thenResultType, resultType)) + for (unsigned i = 0; i < expected_num_results; ++i) { + auto result_type = op.getResult(i).getType().cast(); + auto then_result_type = then_fn_type.getResult(i).cast(); + if (!AreCastCompatible({then_result_type, result_type})) return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", - thenResultType, resultType, i)); + then_result_type, result_type, i)); - auto elseResultType = elseFuncType.getResult(i).cast(); - if (!AreCastCompatible(elseResultType, resultType)) + auto else_result_type = else_fn_type.getResult(i).cast(); + if (!AreCastCompatible({else_result_type, result_type})) return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", - elseResultType, resultType, i)); + else_result_type, result_type, i)); } return success(); } +//===----------------------------------------------------------------------===// +// YieldOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(YieldOp op) { + auto parent = op.getParentOp(); + // A YieldOp should be contained within an IfRegion op + // (and WhileRegion in future) + if (!isa(parent)) + op.emitError() << " expects parent op " + << "'" << IfRegionOp::getOperationName() << "' but got '" + << parent->getName().getStringRef() << "'"; + return success(); +} + +//===----------------------------------------------------------------------===// +// IfRegionOp +//===----------------------------------------------------------------------===// + +LogicalResult VerifyRegionResults(Operation *op, Region ®ion, + StringRef region_name) { + auto op_name = op->getName().getStringRef(); + // verify that op outputs match yield inputs + YieldOp yield = cast(region.front().getTerminator()); + unsigned expected_num_results = op->getNumResults(); + if (yield.getNumOperands() != expected_num_results) + return op->emitError(region_name + " region should have " + + Twine(expected_num_results) + " results"); + + for (int idx : llvm::seq(0, expected_num_results)) { + auto op_result_type = op->getResult(idx).getType().cast(); + auto region_result_type = + yield.getOperand(idx).getType().cast(); + if (!AreCastCompatible({region_result_type, op_result_type})) + return op->emitError(llvm::formatv( + "{0} result type {1} is incompatible with {2} " + "result type {3} at index {4}", + region_name, region_result_type, op_name, op_result_type, idx)); + } + return success(); +} + +static LogicalResult Verify(IfRegionOp op) { + if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) + return failure(); + if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) + return failure(); + if (op.then_branch().front().getNumArguments() != 0) + return op.emitOpError() << "then 
region cannot have any arguments"; + if (op.else_branch().front().getNumArguments() != 0) + return op.emitOpError() << "else region cannot have any arguments"; + return success(); +} + //===----------------------------------------------------------------------===// // InvertOp //===----------------------------------------------------------------------===// @@ -2429,6 +2509,10 @@ void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult RealDivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -2560,6 +2644,81 @@ void ReshapeOp::build(OpBuilder &builder, OperationState &result, Value tensor, return unranked(); } +//===----------------------------------------------------------------------===// +// SelectOp +//===----------------------------------------------------------------------===// + +void SelectOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +// Verifies a few extra requirements on SelectOp: +// (1) `then` and `else` must have same shape +// (2) At least one of the following must be true: +// (a) `cond` has the same rank as `then` and `else` +// (b) `cond` is a scalar +// (c) `cond` is a vector AND `then` and `else` are non-scalar with their +// first dimension equal to `cond`. +static LogicalResult Verify(SelectOp op) { + auto then_tensor = op.t().getType().cast(); + auto else_tensor = op.e().getType().cast(); + // Check (1). + if (!AreCastCompatible({then_tensor, else_tensor})) + return op.emitOpError() << "requires t and e have compatible shapes"; + + // Get data rank (if exists). + int data_rank; + // If data is unranked or data_rank is 0, this will remain -2. Otherwise + // refers to first dimension of then and/or else. + int data_first_dim = -2; + bool then_has_rank = then_tensor.hasRank(); + bool else_has_rank = else_tensor.hasRank(); + if (then_has_rank && else_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + if (else_tensor.getRank() > 0) + data_first_dim = std::max( + static_cast(else_tensor.getShape().front()), data_first_dim); + } else if (then_has_rank) { + data_rank = then_tensor.getRank(); + if (then_tensor.getRank() > 0) + data_first_dim = then_tensor.getShape().front(); + } else if (else_has_rank) { + data_rank = else_tensor.getRank(); + if (else_tensor.getRank() > 0) + data_first_dim = else_tensor.getShape().front(); + } else { + // Neither has a rank. + return success(); + } + + auto cond_tensor = op.condition().getType().dyn_cast(); + if (!cond_tensor) return success(); + auto cond_rank = cond_tensor.getRank(); + // Check (2a) and (2b). + if (cond_rank == 0 || cond_rank == data_rank) return success(); + // Check (2c). + if (cond_rank == 1) { + auto cond_shape = cond_tensor.getShape().front(); + if (data_rank == 0) { + return op.emitOpError() + << "requires that t and e are nonscalar when pred is a vector"; + } + // We know `data` tensor has a rank of at least 1. + if (data_first_dim != -1 && cond_shape != -1 && + data_first_dim != cond_shape) { + return op.emitOpError() << "requires that, when pred is a vector, the " + "shape matches the first dimension of t and e"; + } + return success(); + } + // None of (2a,b,c) were true; fail. 
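Taken together, checks (2a) through (2c) above reduce to a small decision on the predicate's rank; a minimal standalone sketch under simplified assumptions (plain C++, -1 standing in for a dynamic size, SelectPredShapeOk a hypothetical name):

#include <cstdint>

// Returns true when a predicate of rank `cond_rank` (with leading size
// `cond_len` if it is a vector) is acceptable for data of rank `data_rank`
// whose leading dimension is `data_first_dim`. -1 means dynamic/unknown.
bool SelectPredShapeOk(int64_t cond_rank, int64_t cond_len,
                       int64_t data_rank, int64_t data_first_dim) {
  if (cond_rank == 0) return true;           // (2b) scalar predicate
  if (cond_rank == data_rank) return true;   // (2a) same rank as t and e
  if (cond_rank != 1) return false;          // otherwise pred must be a vector
  if (data_rank == 0) return false;          // (2c) t and e must be non-scalar
  // (2c) the vector length must match the first data dimension, unless either
  // side is dynamic and therefore cannot be verified at this point.
  return cond_len == -1 || data_first_dim == -1 || cond_len == data_first_dim;
}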
+ return op.emitOpError() << "requires that pred is a scalar OR has the same " + "rank as t and e OR is a vector"; +} + //===----------------------------------------------------------------------===// // SelectV2Op //===----------------------------------------------------------------------===// @@ -2619,9 +2778,12 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, << variadic_idx_str << " to match rank of operand" << variadic_idx_str; } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, verify that the result is dynamic. - return op->emitOpError("requires dynamic shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. + op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } Type element_type = result_ranked_type.getElementType(); @@ -3572,12 +3734,20 @@ OpFoldResult FoldIdentityTranspose(TransposeOp op) { if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto &elements = const_value.getValues(); + const auto elements = const_value.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; } + // TODO(jpienaar): Remove if/when we handle this more generally. + if (op.getType() != op.x().getType()) { + // If the types don't match then only fold if all the operands are in the TF + // dialect. + for (auto user : op.getOperation()->getUsers()) + if (user->getDialect() != op.getDialect()) return {}; + } + return op.x(); } @@ -3721,36 +3891,37 @@ OpFoldResult VariableShapeOp::fold(ArrayRef operands) { static LogicalResult Verify(WhileOp op) { auto module = op.getParentOfType(); - auto condFn = module.lookupSymbol(op.cond()); - auto bodyFn = module.lookupSymbol(op.body()); - if (!condFn) { + auto cond_fn = module.lookupSymbol(op.cond()); + auto body_fn = module.lookupSymbol(op.body()); + if (!cond_fn) { return op.emitOpError("cond refers to an undefined function : ") << op.cond(); } - if (!bodyFn) { + if (!body_fn) { return op.emitOpError("body refers to an undefined function : ") << op.body(); } - auto condFuncType = condFn.getType(); - auto bodyFuncType = bodyFn.getType(); + auto cond_fn_type = cond_fn.getType(); + auto body_fn_type = body_fn.getType(); // Verify that the cond function has exactly one result. - if (condFuncType.getNumResults() != 1) + if (cond_fn_type.getNumResults() != 1) return op.emitOpError("requires cond function to have exactly one result"); SmallVector operands(op.getOperandTypes()); // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. 
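The loop further below compares these lists pair-wise, starting the inner index at max(2, i + 1) so that exactly one pair, operands vs. body-function results, is skipped; a small standalone illustration of which pairs end up being checked (plain C++, names illustrative only):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Index order matches the array in the verifier: 0 operand, 1 body result,
  // 2 result, 3 cond input, 4 body input.
  const char* kNames[] = {"operand", "body result", "result", "cond input",
                          "body input"};
  constexpr int kNumTypeLists = 5;
  std::vector<std::pair<int, int>> pairs;
  for (int i = 0; i < kNumTypeLists; ++i)
    // Starting j at max(2, i + 1) skips exactly one pair, (0, 1): While op
    // operands need not be compatible with the body function's results.
    for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j)
      pairs.emplace_back(i, j);
  for (const auto& p : pairs)
    std::printf("check %s vs %s\n", kNames[p.first], kNames[p.second]);
  // Prints 9 of the 10 possible pairs; only (operand, body result) is omitted.
}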
- int numTypeLists = 5; - std::pair> typeLists[] = { - {"operand", operands}, - {"body function result", bodyFuncType.getResults()}, - {"result", op.getResultTypes()}, - {"cond function input", condFuncType.getInputs()}, - {"body function input", bodyFuncType.getInputs()}, - }; + constexpr int kNumTypeLists = 5; + const std::array>, kNumTypeLists> + type_lists = {{ + {"operand", operands}, + {"body function result", body_fn_type.getResults()}, + {"result", op.getResultTypes()}, + {"cond function input", cond_fn_type.getInputs()}, + {"body function input", body_fn_type.getInputs()}, + }}; // A pair of type lists should be cast compatible with each other if one is // converted to the another for a function call or assignment or there is a @@ -3774,28 +3945,28 @@ static LogicalResult Verify(WhileOp op) { // never converted from one to the another nor there is a common source // tensors. Compatibility requirement is not transitive. - for (int i = 0; i < numTypeLists; ++i) { + for (int i = 0; i < kNumTypeLists; ++i) { // Skip the first pair as the While op operands and body function results // does not need to be compatible with each other. - for (int j = std::max(2, i + 1); j < numTypeLists; ++j) { - auto &a = typeLists[i]; - auto &b = typeLists[j]; + for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { + auto &a = type_lists[i]; + auto &b = type_lists[j]; - int aSize = a.second.size(); - if (aSize != b.second.size()) + int a_size = a.second.size(); + if (a_size != b.second.size()) return op.emitOpError( llvm::formatv("requires the number of {0}s to be equal to the " "number of {1}s. Found {2} and {3}, respectively", - a.first, b.first, aSize, b.second.size())); + a.first, b.first, a_size, b.second.size())); - for (int idx = 0; idx < aSize; ++idx) { - auto aType = a.second[idx]; - auto bType = b.second[idx]; + for (int idx = 0; idx < a_size; ++idx) { + auto a_type = a.second[idx]; + auto b_type = b.second[idx]; - if (!AreCastCompatible(aType, bType)) + if (!AreCastCompatible({a_type, b_type})) return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", - a.first, aType, b.first, bType, idx)); + a.first, a_type, b.first, b_type, idx)); } } } @@ -3877,7 +4048,7 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" >(); addInterfaces(); - addAttributes(); + addAttributes(); // Support unknown operations because not all TensorFlow operations are // registered. @@ -3932,6 +4103,49 @@ void PrintShapeAttr(ShapeAttr attr, DialectAsmPrinter &os) { // NOLINT os << ">"; } +// Parses a #tf.func attribute of the following format: +// +// #tf.func<@symbol, {attr = "value"}> +// +// where the first element is a SymbolRefAttr and the second element is a +// DictionaryAttr. 
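The ParseFuncAttr implementation that follows walks this spec piece by piece, asking mlir::parseAttribute how many characters each sub-attribute consumed. As a rough standalone approximation of the same parser shape (std::string_view only, no MLIR involved; ParseFuncSpec is a hypothetical name, and the naive splitting here is deliberately looser than the real parser):

#include <optional>
#include <string>
#include <string_view>
#include <utility>

// Consumes `prefix` from the front of `s` if present.
static bool ConsumeFront(std::string_view& s, std::string_view prefix) {
  if (s.substr(0, prefix.size()) != prefix) return false;
  s.remove_prefix(prefix.size());
  return true;
}

// Splits a spec of the form func<@symbol, {...}> into its symbol and
// dictionary parts. The real parser instead delegates both pieces to
// mlir::parseAttribute and verifies they are a SymbolRefAttr and a
// DictionaryAttr, so it also rejects inputs this sketch would let through
// (e.g. trailing elements after the dictionary).
std::optional<std::pair<std::string, std::string>> ParseFuncSpec(
    std::string_view spec) {
  if (!ConsumeFront(spec, "func<")) return std::nullopt;
  size_t comma = spec.find(", ");
  if (comma == std::string_view::npos) return std::nullopt;
  std::string symbol(spec.substr(0, comma));          // e.g. "@symbol"
  spec.remove_prefix(comma + 2);
  if (spec.empty() || spec.back() != '>') return std::nullopt;
  std::string dict(spec.substr(0, spec.size() - 1));  // e.g. "{attr = ...}"
  if (symbol.empty() || symbol.front() != '@') return std::nullopt;  // symbol ref
  if (dict.empty() || dict.front() != '{') return std::nullopt;      // dictionary
  return std::make_pair(symbol, dict);
}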
+FuncAttr ParseFuncAttr(MLIRContext *context, StringRef spec, Location loc) { + auto emit_error = [&, spec]() { + emitError(loc, "invalid TensorFlow func attribute: ") << spec; + return nullptr; + }; + + if (!spec.consume_front("func<")) return emit_error(); + + size_t func_name_num_read = 0; + Attribute func_name_attr = + mlir::parseAttribute(spec, context, func_name_num_read); + if (!func_name_attr || !func_name_attr.isa()) + return emit_error(); + spec = spec.drop_front(func_name_num_read); + + if (!spec.consume_front(", ")) return emit_error(); + + size_t func_attrs_num_read = 0; + Attribute func_attrs_attr = + mlir::parseAttribute(spec, context, func_attrs_num_read); + if (!func_attrs_attr || !func_attrs_attr.isa()) + return emit_error(); + spec = spec.drop_front(func_attrs_num_read); + + if (!spec.consume_front(">")) return emit_error(); + + return mlir::TF::FuncAttr::get(context, func_name_attr.cast(), + func_attrs_attr.cast()); +} + +// Prints a #tf.func attribute of the following format: +// +// #tf.func<@symbol, {attr = "value"}> +void PrintFuncAttr(FuncAttr attr, DialectAsmPrinter &os) { + os << "func<" << attr.GetName() << ", " << attr.GetAttrs() << ">"; +} + } // namespace Attribute TensorFlowDialect::parseAttribute(DialectAsmParser &parser, @@ -3941,6 +4155,8 @@ Attribute TensorFlowDialect::parseAttribute(DialectAsmParser &parser, if (spec.startswith("shape")) return ParseShapeAttr(getContext(), spec, loc); + if (spec.startswith("func")) return ParseFuncAttr(getContext(), spec, loc); + return (emitError(loc, "unknown TensorFlow attribute: " + spec), nullptr); } @@ -3950,6 +4166,9 @@ void TensorFlowDialect::printAttribute(Attribute attr, case AttrKind::SHAPE: PrintShapeAttr(attr.cast(), os); break; + case AttrKind::FUNC: + PrintFuncAttr(attr.cast(), os); + break; default: llvm_unreachable("unexpected tensorflow attribute kind"); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 979f506b3b1..88307267ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -31,7 +31,7 @@ limitations under the License. #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 744d1ac5b71..5f3a1a5be35 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -99,6 +99,30 @@ def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect, }]; } +def TF_CollectivePermuteOp : TF_Op<"CollectivePermute", []> { + let summary = "An Op to permute tensors across replicated TPU instances."; + + let description = [{ +Each instance supplies its own input. + +For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing +source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: +`[D, A, B, C]`. 
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$source_target_pairs + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + + def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Permute input tensor from `src_format` to `dst_format`"; @@ -188,7 +212,6 @@ else_branch: A function that takes 'inputs' and returns a list of FlatSymbolRefAttr:$then_branch, FlatSymbolRefAttr:$else_branch, - DefaultValuedAttr:$output_shapes, // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless @@ -207,6 +230,66 @@ else_branch: A function that takes 'inputs' and returns a list of }]; } +def TF_YieldOp : TF_Op<"Yield", [Terminator]> { + let summary = "Yield operation"; + let description = [{ + The "yield" operation represents a return operation within the conditional + and body of structured control flow (e.g., if and while). The operation + takes a variable number of operands and produces no results. The number and + types of inputs must match the signature of the operation that contains the + region. + }]; + + let arguments = (ins Variadic:$operands); + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_IfRegionOp : TF_Op<"IfRegion", + [SingleBlockImplicitTerminator<"YieldOp">]> { + let summary = "output = cond ? then_branch output : else_branch output"; + + let description = [{ +"output = cond ? then_branch output : else_branch output" + +cond: A Tensor. If the tensor is a scalar of non-boolean type, the + scalar is converted to a boolean according to the + following rule: if the scalar is a numerical value, non-zero means + True and zero means False; if the scalar is a string, non-empty + means True and empty means False. If the tensor is not a scalar, + being empty means False and being non-empty means True. +then_branch: A region that computes the outputs of the op if cond = true. + It returns a list of tensors using tf.yield (as the terminator). The + types of these returned tensors is same as that of the else_branch +else_branch: A region that computes the outputs of the op if cond = false. + It returns a list of tensors using tf.yield (as the terminator). The + types of these returned tensors is same as that of the then_branch + }]; + + let arguments = (ins + TF_Tensor:$cond, + + // Used to map StatelessIf and If op defined in TensorFlow to a common op. 
+ BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let regions = (region SizedRegion<1>:$then_branch, SizedRegion<1>:$else_branch); + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Computes the mean of elements across dimensions of a tensor."; @@ -905,5 +988,29 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } +// TODO(b/156507832): Move tf.InplaceUpdate to tf_generated_ops.td once +// autogenerated op def matches. +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = "Updates specified rows 'i' with values 'v'."; + + let description = [{ +Computes `x[i, :] = v; return x`. + +Originally this function is mutative however for compilation we make this +operation create / operate on a copy of `x`. + }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 6c3cd7fac92..994378ea1cf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -28,6 +28,134 @@ llvm::Optional> GetShape(mlir::Value value) { if (shaped_type.hasRank()) return shaped_type.getShape(); return llvm::None; } + +// Merges cast compatible shapes and returns a more refined shape. The two +// shapes are cast compatible if they have the same rank and at each dimension, +// either both have same size or one of them is dynamic. Returns false if the +// given shapes are not cast compatible. The refined shape is same or more +// precise than the two input shapes. +bool GetCastCompatibleShape(llvm::ArrayRef a_shape, + llvm::ArrayRef b_shape, + llvm::SmallVectorImpl* refined_shape) { + if (a_shape.size() != b_shape.size()) return false; + int64_t rank = a_shape.size(); + refined_shape->reserve(rank); + for (auto dims : llvm::zip(a_shape, b_shape)) { + int64_t dim1 = std::get<0>(dims); + int64_t dim2 = std::get<1>(dims); + + if (mlir::ShapedType::isDynamic(dim1)) { + refined_shape->push_back(dim2); + continue; + } + if (mlir::ShapedType::isDynamic(dim2)) { + refined_shape->push_back(dim1); + continue; + } + if (dim1 == dim2) { + refined_shape->push_back(dim1); + continue; + } + return false; + } + return true; +} + +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// +// The two types are considered cast compatible if they have dynamically equal +// shapes and element type. For element types that do not have subtypes, they +// must be equal. However for TensorFlow types such as Resource and Variant, +// that also have subtypes, we recursively check for subtype compatibilty for +// Resource types and assume all variant types are cast compatible. If either +// one of `a` or `b` have empty subtypes, they are considered cast compatible. 
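GetCastCompatibleShape above supplies the per-dimension half of this refinement; a minimal standalone sketch of the same merge (plain C++, with -1 standing in for ShapedType::kDynamicSize and RefineShapes a hypothetical name):

#include <cstdint>
#include <optional>
#include <vector>

constexpr int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamicSize

// Merges two cast-compatible shapes into the most refined one, or returns
// nullopt when they disagree (different rank, or conflicting static sizes).
std::optional<std::vector<int64_t>> RefineShapes(
    const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
  if (a.size() != b.size()) return std::nullopt;
  std::vector<int64_t> refined;
  refined.reserve(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    if (a[i] == kDynamic) { refined.push_back(b[i]); continue; }
    if (b[i] == kDynamic) { refined.push_back(a[i]); continue; }
    if (a[i] != b[i]) return std::nullopt;  // conflicting static dimensions
    refined.push_back(a[i]);
  }
  return refined;
}

// For the example in the comment that follows: merging {2, -1, -1} with
// {-1, 4, -1} yields {2, 4, -1}, i.e. tensor<2x4x?xf32>.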
+// +// The returned type is same or more precise than the input types. For example, +// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and +// tensor respectively, the returned type is tensor<2x4x?xf32>. +// +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a) { + // Fast path if everything is equal. + if (a == b) return b; + + auto a_tt = a.dyn_cast(); + auto b_tt = b.dyn_cast(); + + // If only one of a or b is a tensor type, they are incompatible. + if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; + + // For non-tensor types, we do not need to worry about shape and can return + // early. + if (!a_tt && !b_tt) { + // Remove ref types. + if (may_ignore_ref_type_a) { + if (auto ref_type = a.dyn_cast()) { + a = ref_type.RemoveRef(); + if (a == b) return a; + } + } + if (a.getKind() != b.getKind()) return nullptr; + + // If either is not a type that contain subtypes then the types are not cast + // compatible. + auto a_wst = a.dyn_cast(); + auto b_wst = b.dyn_cast(); + if (!a_wst || !b_wst) return nullptr; + + // For Variant types we are more permissive right now and accept all pairs + // of Variant types. If we are more constrainted and check compatibility of + // subtypes, we might reject valid graphs. + // TODO(prakalps): Variant doesn't have a subtype, we assign it + // one, so we should only assign it one when we know the subtype. Then we + // can be more constrained and check subtypes for cast compatibility as + // well. + if (a.isa()) return a; + + // For Resource types, we recursively check the subtypes for cast + // compatibility, if possible. Otherwise treat them as compatible. + auto a_wst_st = a_wst.GetSubtypes(); + auto b_wst_st = b_wst.GetSubtypes(); + if (a_wst_st.empty() || b_wst_st.empty()) return a; + if (a_wst_st.size() != b_wst_st.size()) return nullptr; + llvm::SmallVector refined_subtypes; + for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) { + mlir::Type refined_st = + GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes), + /*may_ignore_ref_type_a=*/false); + if (!refined_st) return nullptr; + refined_subtypes.push_back(refined_st.cast()); + } + + return mlir::TF::ResourceType::get(refined_subtypes, a.getContext()); + } + + // For tensor types, check compatibility of both element type and shape. + mlir::Type refined_element_ty = GetCastCompatibleType( + a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a); + if (!refined_element_ty) return nullptr; + + if (!a_tt.hasRank() && !b_tt.hasRank()) { + return mlir::UnrankedTensorType::get(refined_element_ty); + } + if (!a_tt.hasRank()) { + return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty); + } + if (!b_tt.hasRank()) { + return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty); + } + + llvm::SmallVector refined_shape; + if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape)) + return nullptr; + + return mlir::RankedTensorType::get(refined_shape, refined_element_ty); +} } // namespace namespace mlir { @@ -224,47 +352,41 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs) { - // Fast path if everything is equal. 
- if (lhs == rhs) return true; + return GetCastCompatibleType(lhs, rhs, may_ignore_ref_type_lhs) != nullptr; +} - // In TF all values are tensors. - auto lhs_tt = lhs.cast(); - auto rhs_tt = rhs.cast(); - - // Verify matching element types. These should be identical dynamically, - // so this allows for types not yet fully refined. - auto lhs_et = lhs_tt.getElementType(); - auto rhs_et = rhs_tt.getElementType(); - if (lhs_et == rhs_et) return true; - - // Remove ref types. - if (may_ignore_ref_type_lhs) { - if (auto ref_type = lhs_et.dyn_cast()) { - lhs_et = ref_type.RemoveRef(); - if (lhs_et == rhs_et) return true; - } - } - - if (lhs_et.getKind() != rhs_et.getKind()) return false; - - // If either is not type that contain subtypes then the element types don't - // match. - auto lhs_wst = lhs_et.dyn_cast(); - auto rhs_wst = rhs_et.dyn_cast(); - if (!lhs_wst || !rhs_wst) return false; - - // Consider the subtype recursively. - auto lhs_wst_st = lhs_wst.GetSubtypes(); - auto rhs_wst_st = rhs_wst.GetSubtypes(); - if (lhs_wst_st.empty() || rhs_wst_st.empty()) return true; - if (lhs_wst_st.size() != rhs_wst_st.size()) return false; - for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) { - if (!HasCompatibleElementTypes(std::get<0>(subtypes), - std::get<1>(subtypes))) - return false; +bool AreCastCompatible(ArrayRef types) { + Type common = types.front(); + for (auto type : types.drop_front()) { + Type refined_type = + GetCastCompatibleType(common, type, /*may_ignore_ref_type_a=*/false); + if (!refined_type) return false; + common = refined_type; } return true; } +ShapedType DropTypeSubTypes(ShapedType ty) { + Type element_ty = ty.getElementType(); + auto subtype_ty = element_ty.dyn_cast(); + if (!subtype_ty) return ty; + + Type default_ty = GetDefaultTypeOf(subtype_ty); + if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); + + return UnrankedTensorType::get(default_ty); +} + +ShapedType DropRefType(ShapedType ty) { + Type element_ty = ty.getElementType(); + TF::TensorFlowRefType ref_ty = element_ty.dyn_cast(); + if (!ref_ty) return ty; + + Type default_ty = TF::GetDefaultTypeOf(ref_ty); + if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); + + return UnrankedTensorType::get(default_ty); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index d1e6a74a0c5..5f108e834a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -313,6 +313,22 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs); bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs = false); +// Returns true if all TensorFlow types can be cast to one +// another. In other words, a single run-time value is legal for both the types. +// For example, tensor<*xf32>, tensor and tensor<3xf32> are cast +// compatible. +bool AreCastCompatible(ArrayRef types); + +// If the given tensor has elements of type with subtypes, then returns a new +// type after dropping subtypes info. Otherwise, returns the original type as +// is. +ShapedType DropTypeSubTypes(ShapedType ty); + +// If the given tensor has elements of type ref, then returns a new type +// of the shape, but corresponding non-ref type as element type. Otherwise, +// returns the original type as is. 
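Stepping back to AreCastCompatible above: it generalizes the old pairwise check by folding the type refinement over the whole list, so that a single run-time value must be legal for every type in it. A standalone sketch of that fold (plain C++ with strings standing in for types; RefinePair stands in for GetCastCompatibleType and is an assumption of this sketch, not real API):

#include <functional>
#include <optional>
#include <string>
#include <vector>

// Returns true when every type in `types` can be cast to a single common
// refined type. `RefinePair` returns the refined type of two compatible
// types, or nullopt when they are not compatible.
bool CastCompatibleList(
    const std::vector<std::string>& types,
    const std::function<std::optional<std::string>(
        const std::string&, const std::string&)>& RefinePair) {
  if (types.empty()) return true;
  std::string common = types.front();
  for (size_t i = 1; i < types.size(); ++i) {
    std::optional<std::string> refined = RefinePair(common, types[i]);
    if (!refined) return false;  // no common run-time value satisfies both
    common = *refined;           // keep folding against the refined type
  }
  return true;
}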
+ShapedType DropRefType(ShapedType ty); + } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 18f8d5f4486..a77aa5b8346 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -258,6 +258,59 @@ func @testDoubleReciprocal(%arg0: tensor<8x16x32x64xi32>) -> tensor<8x16x32x64xi // CHECK: return %arg0 } +// CHECK-LABEL: testSelectScalarPred +func @testSelectScalarPred(%arg0: tensor, %arg1: tensor<4x2xf16>, %arg2: tensor<4x2xf16>) -> tensor<4x2xf16> { + // CHECK-NEXT: "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<4x2xf16>, tensor<4x2xf16>) -> tensor<4x2xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<4x2xf16>, tensor<4x2xf16>) -> tensor<4x2xf16> + return %0: tensor<4x2xf16> +} + +// CHECK-LABEL: testSelectVectorPred +func @testSelectVectorPred(%arg0: tensor<2xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: %[[SHAPE:.*]] = "tf.Const" + // CHECK-NEXT: %[[PRED:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2xi1>, tensor<2xi64>) -> tensor<2x1xi1> + // CHECK-NEXT: "tf.SelectV2"(%[[PRED]], %arg1, %arg2) : (tensor<2x1xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// CHECK-LABEL: testSelectAllSameShape +func @testSelectAllSameShape(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// If we don't have guarantees on input shapes, we can't support canonicalizing +// to SelectV2. Test these cases. 
+// CHECK-LABEL: testSelectInvalid +func @testSelectInvalid(%arg0: tensor, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// CHECK-LABEL: testSelectInvalidUnranked +func @testSelectInvalidUnranked(%arg0: tensor<6x7xi1>, %arg1: tensor<*xf16>, %arg2: tensor<*xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<6x7xi1>, tensor<*xf16>, tensor<*xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + +// CHECK-LABEL: testSelectThenUnranked +func @testSelectThenUnranked(%arg0: tensor<3xi1>, %arg1: tensor<*xf16>, %arg2: tensor<3x2xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<*xf16>, tensor<3x2xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + +// CHECK-LABEL: testSelectElseUnranked +func @testSelectElseUnranked(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<*xf16>) -> tensor<*xf16> { + // CHECK-NEXT: tf.Select + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<*xf16>) -> tensor<*xf16> + return %0: tensor<*xf16> +} + // CHECK-LABEL: testLogicalNotOfEqual func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> { %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> @@ -471,3 +524,22 @@ func @testRankOfRankedTensor(%arg0 : tensor<4x3x2xf32>) -> tensor { // CHECK: return [[VAL0]] return %0 : tensor } + +// CHECK-LABEL: @foldFill +func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex>) { + %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> + + %complex_cst = "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor>} : () -> tensor> + // Here, custom folder doesn't handle complex dtypes and it is folded through + // the constant folding hook. + // TODO(hinsu): Handle complex dtypes in the custom folder for FillOp. 
+ // CHECK: "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor<3x2x1xcomplex>} : () -> tensor<*xcomplex> + %4 = "tf.Fill"(%0, %complex_cst) : (tensor<3xi32>, tensor>) -> tensor<*xcomplex> + + return %2, %3, %4 : tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index bccb8923134..3ae6023400c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -302,15 +302,13 @@ func @testTensorListElementShape(%arg0: tensor>>) -> return %0: tensor<2xi32> } -func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.Add"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Add"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Add"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAdd - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialAddBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { @@ -331,26 +329,22 @@ func @RemoveTrivialAddBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> } -func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAddV2 - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialSub(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialSub(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Sub"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Sub"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialSub - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { @@ -362,26 +356,31 @@ func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { // CHECK-NEXT: return %arg0 : tensor<2x2xi8> } -func @RemoveTrivialMul(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func 
@RemoveTrivialMul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Mul"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Mul"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialMul - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Div"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialDiv - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + +func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.RealDiv"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialRealDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialDivBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { @@ -411,28 +410,35 @@ func @DivBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK: tf.Div } -func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<2x2xf32> { +func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: DontRemoveTrivialAdd // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK: return %[[RESULT]] : tensor<2x2xf32> } -func @DontRemoveTrivialAdd2(%arg0: tensor, %arg1: tensor<2x2xf32>) -> tensor { +func @DontRemoveTrivialAdd2(%arg0: tensor) -> tensor { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - %1 = "tf.AddV2"(%0, %cst) : (tensor , tensor<2x2xf32>) -> tensor - return %1 :tensor + %0 = "tf.AddV2"(%arg0, %cst) : (tensor , tensor<2x2xf32>) -> tensor + return %0 :tensor // CHECK-LABEL: DontRemoveTrivialAdd2 // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // 
CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor // CHECK: return %[[RESULT]] : tensor } + +// Test no fold because of the broadcast. +func @DontRemoveTrivialMul(%arg0: tensor<1x6x8x1xf32>) -> tensor<1x6x8x1xf32> { + %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %1 = "tf.Mul"(%arg0, %0) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> + return %1 : tensor<1x6x8x1xf32> + // CHECK-LABEL: DontRemoveTrivialMul + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + // CHECK: %[[RESULT:.*]] = "tf.Mul"(%arg0, %[[CONST]]) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> + // CHECK: return %[[RESULT]] : tensor<1x6x8x1xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir new file mode 100644 index 00000000000..de6f9b42ba4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir @@ -0,0 +1,47 @@ +// RUN: tf-opt %s -tf-executor-tpu-v1-island-outlining | FileCheck %s --dump-input=fail + +// CHECK: func @control_input +// CHECK-NOT: func @ +// CHECK-LABEL: module @_tpu_v1_compat_outlined +// CHECK: @_tpu_v1_compat_outlined_func0 +// CHECK: func @branch_0 +// CHECK: func @branch_1 +// CHECK: func @branch_2 +// CHECK: func @branch_3 +// CHECK: func @branch_4 +module { + func @control_input(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %output, %control = tf_executor.island { + "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1, topology = "topology"} : () -> () + %index = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %input = "tf.opB"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4]} : (tensor, tensor) -> tensor + tf_executor.yield %result : tensor + } + tf_executor.fetch %output : tensor + + } + return %0 : tensor + } + func @branch_0(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_1(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_2(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_3(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_4(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir new file mode 100644 index 00000000000..cd3b8b55032 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/func-attr-invalid.mlir @@ -0,0 +1,50 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics + +// Tests invalid #tf.func attributes. 
+ +// expected-error@+1 {{invalid TensorFlow func attribute: func}} +func @main() attributes {tf._implements = #tf.func} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<>}} +func @main() attributes {tf._implements = #tf.func<>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol>}} +func @main() attributes {tf._implements = #tf.func<@symbol>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<{}>}} +func @main() attributes {tf._implements = #tf.func<{}>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<"test", {}>}} +func @main() attributes {tf._implements = #tf.func<"test", {}>} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol, "">}} +func @main() attributes {tf._implements = #tf.func<@symbol, "">} { + return +} + +// ----- + +// expected-error@+1 {{invalid TensorFlow func attribute: func<@symbol, {}, "">}} +func @main() attributes {tf._implements = #tf.func<@symbol, {}, "">} { + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir new file mode 100644 index 00000000000..de17778c105 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/func-attr.mlir @@ -0,0 +1,13 @@ +// RUN: tf-opt %s | tf-opt | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @func_attr +// CHECK-SAME: tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random"}> +func @func_attr() attributes {tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random"}>} { + return +} + +// CHECK-LABEL: func @nested_func_attr +// CHECK-SAME: tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random", nested = #tf.func<@symbol_b, {attr2 = true, attr3 = 8.000000e+00 : f32}>}> +func @nested_func_attr() attributes {tf._implements = #tf.func<@symbol_a, {attr0 = 1 : i32, attr1 = "random", nested = #tf.func<@symbol_b, {attr2 = true, attr3 = 8.0 : f32}>}>} { + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt new file mode 100644 index 00000000000..9f044c62736 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/function-func-attr.pbtxt @@ -0,0 +1,53 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -o - | FileCheck %s --dump-input-on-failure + +node { + name: "custom_relu_func_call" + op: "custom_relu" +} +node { + name: "custom_embedding_matmul_func_call" + op: "custom_embedding_matmul" +} +library { + function { + signature { + name: "custom_relu" + } + attr { + key: "_implements" + value { + func { + name: "tensorflow.relu" + } + } + } + } + function { + signature { + name: "custom_embedding_matmul" + } + attr { + key: "_implements" + value { + func { + name: "tensorflow.embedding_matmul" + attr { + key: "key1" + value { + i: 2 + } + } + attr { + key: "key2" + value { + b: false + } + } + } + } + } + } +} + +# CHECK-DAG: func @custom_relu{{[0-9]*}}() attributes {tf._implements = #tf.func<@tensorflow.relu, {}>} +# CHECK-DAG: func @custom_embedding_matmul{{[0-9]*}}() attributes {tf._implements = #tf.func<@tensorflow.embedding_matmul, {key1 = 2 : i64, key2 = false}>} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index d26585edb03..03640e24aac 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -13,7 +13,7 @@ # CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] # CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor<*xf32>, tensor<*xf32> -# CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> +# CHECK: func @[[FUNC]](%arg0: tensor<*xf32> {tf._user_specified_name = "inputs"}, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> node { name: "args_0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index 0e6e561225d..eb358d52b26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -81,7 +81,7 @@ library { } # Check that the `resource_arg_unique_id` for each argument is propagated to the -# `tf.resource_arg_unique_id` argument attribute of the function +# `tf._resource_arg_unique_id` argument attribute of the function # @test_func_name0. # CHECK: func @main @@ -92,8 +92,8 @@ library { # CHECK: tf_executor.fetch # CHECK: return # CHECK: func @test_func_name0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 # CHECK: tf_executor.graph # CHECK: tf_executor.fetch # CHECK: return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 10cb4f8019d..198227bf5dd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -2,17 +2,17 @@ func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } @@ -23,12 +23,12 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_add(%arg0: tensor<1xi32>, %arg1: 
tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + %0 = "xla_chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0 : tensor<4x4x4x4xi32> } @@ -38,7 +38,7 @@ func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -48,7 +48,7 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } @@ -68,7 +68,7 @@ func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -78,7 +78,7 @@ func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -88,7 +88,7 @@ func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { } func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "xla_chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } @@ -98,7 +98,7 @@ func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - %0 = "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> 
tensor<2x4xi32> + %0 = "xla_chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> return %0 : tensor<2x4xi32> } @@ -108,12 +108,12 @@ func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { } func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - %0 = "xla_hlo.and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } @@ -123,12 +123,12 @@ func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { } func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } @@ -138,12 +138,12 @@ func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -153,12 +153,12 @@ func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { } func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -174,19 +174,19 @@ func @pow_dynamic(%arg0: tensor) -> tensor { func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { %0 = xla_hlo.constant dense<0> : tensor<2x3xi32> - %1 = "xla_hlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> + %1 = "xla_chlo.broadcast_compare"(%arg0, %0) {comparison_direction = "LT"} : 
(tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> %2 = xla_hlo.constant dense<0> : tensor<3xi32> - %3 = "xla_hlo.compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> - %4 = "xla_hlo.compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> - %5 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %3 = "xla_chlo.broadcast_compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> + %4 = "xla_chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> + %5 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %6 = "xla_hlo.abs"(%arg0) : (tensor<2x3xi32>) -> tensor<2x3xi32> %7 = "xla_hlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> %8 = xla_hlo.constant dense<1> : tensor<3xi32> %9 = xla_hlo.subtract %7, %8 : tensor<3xi32> - %10 = "xla_hlo.add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %10 = "xla_chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %11 = "xla_hlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = "xla_hlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> - %13 = "xla_hlo.divide"(%11, %12) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %13 = "xla_chlo.broadcast_divide"(%11, %12) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %14 = "xla_hlo.select"(%4, %5, %13) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %14 : tensor<2x3xi32> } @@ -195,14 +195,14 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 %0 = xla_hlo.constant dense<0> : tensor<3xi32> %1 = "xla_hlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> %2 = xla_hlo.constant dense<0> : tensor<2x3xi32> - %3 = "xla_hlo.compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> - %4 = "xla_hlo.compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> - %5 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %3 = "xla_chlo.broadcast_compare"(%arg1, %2) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> + %4 = "xla_chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> + %5 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %6 = "xla_hlo.abs"(%arg0) : (tensor<3xi32>) -> tensor<3xi32> %7 = "xla_hlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %8 = xla_hlo.constant dense<1> : tensor<2x3xi32> %9 = xla_hlo.subtract %7, %8 : tensor<2x3xi32> - %10 = "xla_hlo.add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, 
tensor<2x3xi32>) -> tensor<2x3xi32> + %10 = "xla_chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %11 = "xla_hlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = "xla_hlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %13 = xla_hlo.divide %11, %12 : tensor<2x3xi32> @@ -218,8 +218,8 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { } func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> - %1 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %0 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %1 = "xla_chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> %2 = "xla_hlo.floor"(%1) : (tensor<2x3xf16>) -> tensor<2x3xf16> return %2 : tensor<2x3xf16> } @@ -230,22 +230,22 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -255,17 +255,17 @@ func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: 
tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } @@ -275,7 +275,7 @@ func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -285,7 +285,7 @@ func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -295,7 +295,7 @@ func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -305,7 +305,7 @@ func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { } func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "xla_chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } @@ -326,35 +326,35 @@ func @const() -> tensor<2xi32> { func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = xla_hlo.constant dense<0> : tensor - %1 = "xla_hlo.maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + %1 = "xla_chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> return %1 : tensor<1xi32> } func @relu_unranked(%arg0: tensor) -> tensor { %0 = xla_hlo.constant dense<0> : tensor - %1 = "xla_hlo.maximum"(%0, %arg0) {broadcast_dimensions = 
dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %1 = "xla_chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %1 : tensor } func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = xla_hlo.constant dense<0> : tensor %1 = xla_hlo.constant dense<6> : tensor - %2 = "xla_hlo.minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> - %3 = "xla_hlo.maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %2 = "xla_chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %3 = "xla_chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> return %3 : tensor<1xi32> } func @relu6_unranked(%arg0: tensor) -> tensor { %0 = xla_hlo.constant dense<0> : tensor %1 = xla_hlo.constant dense<6> : tensor - %2 = "xla_hlo.minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor - %3 = "xla_hlo.maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %2 = "xla_chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %3 = "xla_chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %3 : tensor } func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf32> { %0 = xla_hlo.constant dense<0.000000e+00> : tensor - %1 = "xla_hlo.compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor + %1 = "xla_chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor %2 = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> %3 = "xla_hlo.select"(%1, %arg0, %2) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> return %3 : tensor<4x8xf32> @@ -682,6 +682,37 @@ func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { return %0 : tensor<2xf32> } +func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { + %0 = "xla_hlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x4672xf32>) -> tensor<1x519xf32> + return %0 : tensor<1x519xf32> +} + +func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> + return %0 : tensor<2x2x6xf32> + +} + +func @convert_dot_1d_2d(%arg0: tensor<256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256x1xf32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) -> tensor<1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256xf32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256xf32>) -> tensor + return %0 : tensor +} + +func @convert_dot_2d_2d(%arg0: 
tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> + return %0 : tensor<1x1xf32> +} + // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // CHECK-LABEL: func @biasAdd_NHWC( @@ -1493,3 +1524,49 @@ func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { // CHECK: [[VAL_371:%.*]] = "tf.Cast"([[VAL_370]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> // CHECK: return [[VAL_371]] : tensor<2xf32> // CHECK: } + +// CHECK-LABEL: func @convert_slice( +// CHECK-SAME: [[VAL_372:%.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { +// CHECK: [[VAL_373:%.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: [[VAL_374:%.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: [[VAL_375:%.*]] = "tf.Slice"([[VAL_372]], [[VAL_373]], [[VAL_374]]) : (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> +// CHECK: return [[VAL_375]] : tensor<1x519xf32> +// CHECK: } + +// CHECK-LABEL: func @reshape( +// CHECK-SAME: [[VAL_372:%.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { +// CHECK: [[VAL_373:%.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> +// CHECK: [[VAL_374:%.*]] = "tf.Reshape"([[VAL_372]], [[VAL_373]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> +// CHECK: return [[VAL_374]] : tensor<2x2x6xf32> +// CHECK: } + +// CHECK-LABEL: func @convert_dot_1d_2d( +// CHECK-SAME: [[VAL_376:%.*]]: tensor<256xf32>, [[VAL_377:%.*]]: tensor<256x1xf32>) -> tensor<1xf32> { +// CHECK: [[VAL_378:%.*]] = "tf.Reshape"([[VAL_376]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_379:%.*]] = "tf.MatMul"([[VAL_378]], [[VAL_377]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_380:%.*]] = "tf.Reshape"([[VAL_379]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return [[VAL_380]] : tensor<1xf32> +// CHECK: } + +// CHECK-LABEL: func @convert_dot_2d_1d( +// CHECK-SAME: [[VAL_381:%.*]]: tensor<1x256xf32>, [[VAL_382:%.*]]: tensor<256xf32>) -> tensor<1xf32> { +// CHECK: [[VAL_383:%.*]] = "tf.Reshape"([[VAL_382]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_384:%.*]] = "tf.MatMul"([[VAL_381]], [[VAL_383]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_385:%.*]] = "tf.Reshape"([[VAL_384]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return [[VAL_385]] : tensor<1xf32> +// CHECK: } + +// CHECK-LABEL: func @convert_dot_1d_1d( +// CHECK-SAME: [[VAL_386:%.*]]: tensor<256xf32>, [[VAL_387:%.*]]: tensor<256xf32>) -> tensor { +// CHECK-DAG: [[VAL_388:%.*]] = "tf.Reshape"([[VAL_386]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK-DAG: [[VAL_389:%.*]] = "tf.Reshape"([[VAL_387]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_390:%.*]] = "tf.MatMul"([[VAL_388]], [[VAL_389]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_391:%.*]] = "tf.Reshape"([[VAL_390]], {{.*}}) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor +// CHECK: return [[VAL_391]] : tensor +// CHECK: } + +// CHECK-LABEL: func @convert_dot_2d_2d( +// CHECK-SAME: [[VAL_392:%.*]]: tensor<1x256xf32>, 
[[VAL_393:%.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { +// CHECK: [[VAL_394:%.*]] = "tf.MatMul"([[VAL_392]], [[VAL_393]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: return [[VAL_394]] : tensor<1x1xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir index 680e26f5cbb..44824ea1424 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir @@ -8,14 +8,14 @@ func @main() -> tensor<*x!tf.resource> attributes {tf.entry_function = {inputs = } return %0 : tensor<*x!tf.resource> } -func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { +func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { %0 = tf_executor.graph { tf_executor.fetch %arg0 : tensor<*x!tf.resource> } return %0 : tensor<*x!tf.resource> } -// Check that the `tf.resource_arg_unique_id` argument attributes of +// Check that the `tf._resource_arg_unique_id` argument attributes of // test_func_name0 are propagated to the function's arg_attr and // resource_arg_unique_id. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index e7f4873594b..59c93a66d12 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -1,11 +1,11 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FileCheck %s -dump-input-on-failure // One resource, one read. The initial value of the resource is read. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD]]) // CHECK: return %[[PACK]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -19,8 +19,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource, one write. The initial value of the resource is not read. -// CHECK-LABEL: func @main() -> (tensor {tf.resource_name = "x"}) -func @main() { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.AssignVariableOp" // CHECK: return %[[CONST]] @@ -33,12 +33,12 @@ func @main() { // ----- // One resource, two reads using different resource handles. 
-// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg0) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -56,12 +56,12 @@ func @main() -> tensor<2xf32> { // ----- // Two resources, two reads using different resources. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}, %arg1: tensor {tf.resource_name = "y"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}, %arg2: tensor {tf.resource_name = "y"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg2) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -79,12 +79,12 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is read. -// CHECK-LABEL: func @main(%arg0: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %{{[0-9]*}}) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %{{[0-9]*}}) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) - // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg0, %[[ADD2]]) + // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg1, %[[ADD2]]) // CHECK: return %[[PACK]], %[[ADD1]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -102,8 +102,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is not read. 
-// CHECK-LABEL: func @main() -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%[[CONST]], %[[CONST]]) @@ -138,15 +138,15 @@ func @cond_true(%arg0: tensor>>, %arg1: tensor) -> return %2 : tensor } -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { %0 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor %3 = "tf.Less"(%2, %0) : (tensor, tensor) -> tensor %4 = "tf.If"(%3, %1, %2) {Tcond = i1, Tin = ["tfdtype$DT_RESOURCE", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], - else_branch = @cond_false, is_stateless = false, output_shapes = [#tf.shape<>], - then_branch = @cond_true} : (tensor, tensor>>, tensor) -> tensor + else_branch = @cond_false, is_stateless = false,then_branch = @cond_true} : + (tensor, tensor>>, tensor) -> tensor %5 = "tf.Identity"(%4) : (tensor) -> tensor %6 = "tf.Pack"(%2, %5) {N = 2 : i64, T = f32, axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xf32> return %6 : tensor<2xf32> @@ -157,10 +157,11 @@ func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outp // Tests resource passed in as an argument is not modified and not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor -func @main(%arg0: tensor>>) { - %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK-NEXT: "tf.AddV2"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-SAME: %arg0: tensor +// CHECK-SAME: %[[ARG_1:[a-z0-9]+]]: tensor +func @main(%arg0: tensor, %arg1: tensor>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + // CHECK-NEXT: "tf.AddV2"(%[[ARG_1]], %[[ARG_1]]) %1 = "tf.AddV2"(%0, %0) : (tensor, tensor) -> tensor // CHECK-NEXT: return return @@ -171,9 +172,10 @@ func @main(%arg0: tensor>>) { // Tests resource passed in as an argument is modified but not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { // CHECK-NEXT: %[[CONST:[a-z0-9]+]] = "tf.Const" %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -186,9 +188,10 @@ func @main(%arg0: tensor>>) { // Tests last resource assign is returned as a result. 
// CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -204,9 +207,10 @@ func @main(%arg0: tensor>>) { // returns the same value prior. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -221,9 +225,10 @@ func @main(%arg0: tensor>>) -> tensor { // Tests read interleaved between writes. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[CONST_0:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -271,7 +276,7 @@ func @main(%arg0: tensor>>, %arg1: tensor>>) -> tensor { %0 = "tf.VarIsInitializedOp"(%arg0) : (tensor>>) -> tensor + %1 = "tf.UnknownOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -323,7 +329,7 @@ func @main(%arg0: tensor>>) -> tensor { // Tests VarHandleOp has users that are not removed. func @main() -> tensor { - // expected-error@+1 {{expects no uses but used by operations: tf.UnknownOp, tf.VarIsInitializedOp}} + // expected-error@+1 {{expects users to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %1 = "tf.VarIsInitializedOp"(%0) : (tensor>>) -> tensor %2 = "tf.UnknownOp"(%0) : (tensor>>) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir new file mode 100644 index 00000000000..8b8a070cfab --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir @@ -0,0 +1,59 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-var-handles-to-args | FileCheck %s -dump-input-on-failure + +// Tests main function with multiple blocks. 
+ +// expected-error@+1 {{expects function 'main' to have 1 block, got 2}} +func @main() { + br ^bb1 +^bb1: + return +} + +// ----- + +// CHECK-LABEL: func @no_args +// CHECK-SAME: (%arg0: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @no_args() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @some_args +// CHECK-SAME: (%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @some_args(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @unique_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}, %arg1: tensor>> {tf.resource_name = "y"}) +// CHECK-NOT: "tf.VarHandleOp" +func @unique_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "y"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars_with_users +// CHECK-SAME: (%arg0: tensor, %arg1: tensor>> {tf.resource_name = "x"}) +// CHECK: "tf.ReadVariableOp"(%arg1) +// CHECK: "tf.AssignAddVariableOp"(%arg1, %arg0) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars_with_users(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor + %2 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignAddVariableOp"(%2, %arg0) : (tensor>>, tensor) -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir new file mode 100644 index 00000000000..2970e31c3c9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir @@ -0,0 +1,85 @@ +// RUN: tf-opt -verify-diagnostics -readonly-references-to-resources -split-input-file %s | FileCheck %s --dump-input=fail + +// Test case: Basic converting. + +func @f() { + // CHECK: "tf.VarHandleOp" + // CHECK: "tf.ReadVariableOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: Two ReadVariable ops. + +func @f() { + // CHECK: "tf.VarHandleOp" + + // During lowering to resource variables, this pass will preserve the + // locations of the ReadVariableOps as Identity ops to keep the original graph + // composition and order. 
+ + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.ReadVariableOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + %val2 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No follow-up ReadVariable case. + +func @f() { + // CHECK-NOT: "tf.VariableV2" + // CHECK-NOT: "tf.VarHandleOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + return +} + +// ----- + +// Test case: No converting when there is another use case. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects all users to be 'tf.Identity', but got user tf.CustomIdentity}} + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.CustomIdentity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No class attribute on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op has no '_class' attribute}} + %val0 = "tf.VariableV2"() {container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No named location found on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects variable name in '_class' attribute, but got ["unrelated_class"]}} + %val0 = "tf.VariableV2"() {_class = ["unrelated_class"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: Invalid multiple location information in a class attribute on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects only one named location in '_class' attribute, but got ["loc:@v1", "loc:@v2"]}} + %val0 = "tf.VariableV2"() {_class = ["loc:@v1", "loc:@v2"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index c8b4ad2cb9f..28c542cded1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -119,3 +119,60 @@ func @replicate_control() { // CHECK: %[[REPLICA_1:.*]] = tf_executor.island // CHECK: %[[SINK:.*]] = tf_executor.island(%[[REPLICA_0]], %[[REPLICA_1]]) // CHECK: tf_executor.fetch %[[SINK]] + + +// Tests replicate results are remapped correctly. 
+// CHECK-LABEL: func @replicate_result +func @replicate_result(%arg0: tensor, %arg1: tensor) { + %0:4 = tf_executor.graph { + %1:5 = tf_executor.island { + %2:4 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + %3 = "tf.opA"(%arg2) : (tensor) -> tensor + %4 = "tf.opB"(%arg2) : (tensor) -> tensor + tf_device.return %3, %4 : tensor, tensor + } + tf_executor.yield %2#0, %2#1, %2#2, %2#3 : tensor, tensor, tensor, tensor + } + tf_executor.fetch %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor + } + return +} + +// CHECK: %[[REPLICA_0:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: %[[REPLICA_1:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: tf_executor.fetch %[[REPLICA_0]]#0, %[[REPLICA_1]]#0, %[[REPLICA_0]]#1, %[[REPLICA_1]]#1 + + +// Tests replica id is added correctly. +// CHECK-LABEL: func @replica_id_attr_added +func @replica_id_attr_added(%arg0: tensor, %arg1: tensor) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg2){table_ids = [1, 2]} : (tensor) -> () + "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg2){table_ids = [1, 2]} : (tensor) -> () + "tf.A"(%arg2) : (tensor) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island +// CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch" +// CHECK-SAME: _xla_replica_id = 0 +// CHECK: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" +// CHECK-SAME: _xla_replica_id = 0 +// CHECK: "tf.A" +// CHECK-NOT: _xla_replica_id +// CHECK: tf_executor.island +// CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch" +// CHECK-SAME: _xla_replica_id = 1 +// CHECK: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" +// CHECK-SAME: _xla_replica_id = 1 +// CHECK: "tf.A" +// CHECK-NOT: _xla_replica_id +// CHECK: tf_executor.fetch diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir index c98e40fed05..60eded3de7e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -217,7 +217,7 @@ func @error_on_conflict_multiple_callers( // expected-error@above {{Conflicting device assignment for resource}} then_branch = @if_then_and_else, else_branch = @if_then_and_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () tf_executor.yield diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 9e7358ab2f5..2353dc5a7a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -406,6 +406,61 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // ----- +// CHECK: func @cluster_with_case(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_case(%arg0: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) + // CHECK-DAG: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: 
%[[CLUSTER:.*]]:2 = "tf_device.cluster"() + %2 = "tf_device.cluster"() ( { + // CHECK: %[[CASE:.*]]:2 = "tf.Case"(%[[ARG0]], %[[READ0]], %[[READ1]]) + %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2]} + : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0) + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK-NEXT: tf_device.return %[[ADD]], %[[CASE]]#1 + tf_device.return %5 : tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} +// CHECK: func @branch_0(%[[TARG0:.*]]: tensor<4xf32>, %[[TARG1:.*]]: tensor<4xf32>) +func @branch_0(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>, tensor<4xf32>) { + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%arg0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + // CHECK-NEXT: return %[[CONST]], %[[CONST]] + return %arg0, %constant : tensor<*x!tf.resource>>, tensor<4xf32> +} +// CHECK: func @branch_1(%[[EARG0:.*]]: tensor<4xf32>, %[[EARG1:.*]]: tensor<4xf32>) +func @branch_1(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>, tensor<4xf32>) { + %id = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%arg0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + // CHECK-NEXT: return %[[EARG1]], %[[EARG1]] + return %arg0, %read : tensor<*x!tf.resource>>, tensor<4xf32> +} +// CHECK: func @branch_2(%[[EARG0:.*]]: tensor<4xf32>, %[[EARG1:.*]]: tensor<4xf32>) +func @branch_2(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) + -> (tensor<*x!tf.resource>>, tensor<4xf32>) { + %id = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%arg0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + // CHECK-NEXT: return %[[EARG1]], %[[EARG1]] + return %arg0, %read : tensor<*x!tf.resource>>, tensor<4xf32> +} + +// ----- + // Tests that pass lifts resource reads from if branches. 
// CHECK: func @cluster_with_if(%[[ARG0:.*]]: tensor) -> tensor<4xf32> @@ -420,7 +475,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]]:2 = "tf.If"(%[[ARG0]], %[[READ0]], %[[READ1]]) %3:2 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [#tf.shape<>, #tf.shape<4>], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<4xf32>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) @@ -468,7 +523,7 @@ func @cluster_with_nested_if(%arg0: tensor) -> tensor { %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]] = "tf.If"(%[[ARG0]], %[[READ0]]) %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[IF]], %[[IF]]) @@ -488,7 +543,7 @@ func @if_then(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.re // CHECK-NEXT: %[[IIF:.*]] = "tf.If"(%[[TARG0]], %[[TARG0]]) %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor %3 = "tf.If"(%read, %arg0) {then_branch = @inner_if_then, else_branch = @inner_if_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK-NEXT: return %[[IIF]] @@ -524,9 +579,9 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf_device.cluster"() ( { - // expected-error @+1 {{unsupported tf.IfOp output: resource does not alias a single input.}} + // expected-error @+1 {{unsupported output: resource does not alias a single input}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [#tf.shape<>], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) %4 = "tf.ReadVariableOp"(%3) : (tensor<*x!tf.resource>>) -> tensor<4xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 160bba94cfc..e3766a7d9d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -1,10 +1,11 @@ -// RUN: tf-opt %s -tf-shape-inference -verify-diagnostics | FileCheck %s -dump-input=fail +// RUN: tf-opt %s -tf-shape-inference=propagate-caller-callee-constants=false -verify-diagnostics | FileCheck %s -dump-input=fail +// RUN: tf-opt %s -tf-shape-inference=propagate-caller-callee-constants -verify-diagnostics | FileCheck %s -dump-input=fail module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK-NOT: tf.Cast - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2" + // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = 
"tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> @@ -60,8 +61,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[CST:.*]] = "tf.Const"{{.*}} {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[CST]] +// CHECK: %[[SHAPE:.*]] = "tf.Shape" +// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> @@ -101,7 +102,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @shape_from_if_to_branch_functions func @shape_from_if_to_branch_functions(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } @@ -183,16 +184,16 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @invalid_function_reused_by_control_flows func @invalid_function_reused_by_control_flows(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - // expected-warning @+1 {{unable to refine shape}} - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> - // expected-warning @+1 {{unable to refine shape}} - %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // expected-warning @+1 {{unable to refine shape}} + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // expected-warning @+1 {{unable to refine shape}} + %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } // CHECK-LABEL: func 
@reused_if_then_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_then_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function reused_if_then_branch to have exactly 1 use}} func @reused_if_then_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: return // CHECK-SAME: tensor<*xf32> @@ -201,7 +202,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @reused_if_else_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_else_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function reused_if_else_branch to have exactly 1 use}} func @reused_if_else_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Identity"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) @@ -300,13 +301,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } - // CHECK-LABEL: func @fold_cast - func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NOT: Cast - %0 = "tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) - return %0 : tensor<*xf32> - } - // CHECK-LABEL: func @while_variant // CHECK-SAME: -> tensor>> func @while_variant(%arg0: tensor>>) -> tensor { @@ -362,8 +356,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @partitioned_call_func_const func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK: return %[[CONST]] return %arg0 : tensor<2xi32> } @@ -410,4 +402,18 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor return } + + // CHECK-LABEL: const_fold + func @const_fold() -> () { + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Add + // CHECK-SAME: (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index c36771c0576..965b3b10843 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -786,9 +786,9 @@ func @tf_registry_ops( // CHECK-LABEL: func @arguments_with_unique_ids func @arguments_with_unique_ids( // expected-remark@above {{ID: 9}} - %arg0: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg1: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg2: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 33 : i64}) { + %arg0: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg1: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg2: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 33 : i64}) { tf_executor.graph { // 
expected-remark@above {{ID: 7}} // expected-remark@above {{Successors: {8}}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 118ce2e8645..8692104e772 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -854,6 +854,265 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // ----- +// Test invalid tf.Yield operation (parent should be IfRegion) +func @testInvalidYieldOp(%arg0: f32) -> () { + // expected-error @+1 {{expects parent op 'tf.IfRegion'}} + "tf.Yield"(%arg0) : (f32) -> () +} + +// ----- + +// Test valid tf.IfRegion operation +// CHECK-LABEL: func @testValidIfRegionOp +func @testValidIfRegionOp(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%neg) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// Test valid tf.IfRegion operation with multiple results +// CHECK-LABEL: func @testValidIfRegionOpWithMultipleResults +func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0, %1, %2 = "tf.IfRegion"(%arg0) ({ + %t0 = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %t1 = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %t2 = "tf.Acosh"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t0, %t1, %t2) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> () + }, { + %e0 = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %e1 = "tf.Relu"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %e2 = "tf.Sin"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e0, %e1, %e2) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + + %3 = "tf.Add"(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + %4 = "tf.Add"(%2, %3) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %4 : tensor<2xf32> +} + +// ----- + +// Test invalid type for operand #0 for tf.IfRegion operation +func @testInvalidIfRegionOpType0(%arg0: f32, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{operand #0 must be tensor of tf.dtype values}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (f32) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.IfRegion operation should have 2 regions +func @testInvalidIfRegionOp1Region(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testInvalidIfRegionOpNoRegions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} 
+ +// ----- + +func @testInvalidIfRegionOp3Regions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %te = "tf.Relu"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%te) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.IfRegion regions should be terminated with a tf.Yield +func @testIfRegionThenTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} + // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} + // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.Region yield number of results should match op number of results +func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{then region should have 1 result}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{else region should have 1 result}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e, %e) : (tensor<2xf32>, tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.IfRegion yield types should match op result types +func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{then result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + %0 = "tf.IfRegion"(%arg0) ({ + "tf.Yield"(%arg0) : (tensor) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> 
{ + // expected-error @+1 {{else result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + "tf.Yield"(%arg0) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// value generated in one branch cannot be consumed in the other branch +func @testIfRegionElseConsumingThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + // expected-error @+1 {{use of undeclared SSA value name}} + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionThenConsumingElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.IfRegion"(%arg0) ({ + // expected-error @+1 {{does not dominate this use}} + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + // expected-note @+1 {{operand defined here}} + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// The regions for IfRegion themselves cannot have any arguments +func @testInvalidIfRegionThenArg(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + // expected-error @+1 {{then region cannot have any arguments}} + %0 = "tf.IfRegion"(%arg0) ({ + ^bb(%arg_bb: tensor<2xf32>): + %t = "tf.Abs"(%arg_bb) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%neg) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testInvalidIfRegionElseArg(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + // expected-error @+1 {{else region cannot have any arguments}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%neg) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + ^bb(%arg_bb: tensor<2xf32>): + %e = "tf.Acos"(%arg_bb) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + // Test valid tf.MatrixBandPart // CHECK-LABEL: func @testValidMatrixBandPartOp func @testValidMatrixBandPartOp(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { @@ -881,20 +1140,29 @@ func @testValidMatrixBandPartOpUnranked(%arg0: tensor<*xbf16>, %arg1: tensor, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> - return %0 : tensor<64x64xbf16> +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpUnrankedBand +func @testValidMatrixBandPartOpUnrankedBand(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> + return 
%0 : tensor<*xbf16> +} + +// ----- + +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpCompatibleDynamicShapes +func @testValidMatrixBandPartOpCompatibleDynamicShapes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor } // ----- // Test invalid tf.MatrixBandPart -func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> - return %0 : tensor<64x64xbf16> +func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { + // expected-error @+1 {{op failed to verify that all of {input, band} have dynamically equal types}} + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> + return %0 : tensor<64x64xbf16> } // ----- @@ -998,6 +1266,116 @@ func @pcall_func_2(%arg0: tensor, %arg1: tensor) -> tensor { // ----- +//===--------------------------------------------------------------------===// +// tf.Select +//===--------------------------------------------------------------------===// + +// Test valid tf.Select +// CHECK-LABEL: func @testSelect +func @testSelect(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<3x2xf16>) -> tensor<3x2xf16> { + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<3x2xf16>) -> tensor<3x2xf16> + return %0: tensor<3x2xf16> +} + +// ----- + +func @testInvalidSelect(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // expected-error @+1 {{requires that, when pred is a vector, the shape matches the first dimension of t and e}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + +// ----- + +// Test invalid tf.Select - broadcasting then/else parameters is not supported +func @selectBroadcastThen(%arg0: tensor, %arg1: tensor<8x1xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + // expected-error @+1 {{requires t and e have compatible shapes}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor<8x1xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +func @invalidSelect(%arg0: tensor<2xi1>, %arg1: tensor, %arg2: tensor) -> tensor<2xi32> { + // expected-error @+1 {{requires that t and e are nonscalar when pred is a vector}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor, tensor) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// ----- + +func @invalidSelect(%arg0: tensor<1x8xi1>, %arg1: tensor<1x8x8xi32>, %arg2: tensor<1x8x8xi32>) -> tensor<1x8x8xi32> { + // expected-error @+1 {{requires that pred is a scalar OR has the same rank as t and e OR is a vector}} + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<1x8xi1>, tensor<1x8x8xi32>, tensor<1x8x8xi32>) -> tensor<1x8x8xi32> + return %0: tensor<1x8x8xi32> +} + +// ----- + +//===--------------------------------------------------------------------===// +// tf.SelectV2 +//===--------------------------------------------------------------------===// + +// Test valid tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastThen +func @selectV2BroadcastThen(%arg0: tensor, %arg1: tensor<8x1xi32>, %arg2:
tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<8x1xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// Test valid tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastElse +func @selectV2BroadcastElse(%arg0: tensor, %arg1: tensor<2x8x8xi32>, %arg2: tensor<8x1xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor, tensor<2x8x8xi32>, tensor<8x1xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// Test valid tf.SelectV2 +// CHECK-LABEL: func @selectV2BroadcastPred +func @selectV2BroadcastPred(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x8x8xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32> + return %0: tensor<2x8x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2BroadcastAll +func @selectV2BroadcastAll(%arg0: tensor<8x1x1xi1>, %arg1: tensor<1x8x1xi32>, %arg2: tensor<1x1x8xi32>) -> tensor<8x8x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<8x1x1xi1>, tensor<1x8x1xi32>, tensor<1x1x8xi32>) -> tensor<8x8x8xi32> + return %0: tensor<8x8x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2DynamicRanked +func @selectV2DynamicRanked(%arg0: tensor<1xi1>, %arg1: tensor<2x?x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x?x8xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x?x8xi32>, tensor<2x8x8xi32>) -> tensor<2x?x8xi32> + return %0: tensor<2x?x8xi32> +} + +// ----- + +// CHECK-LABEL: func @selectV2Unranked +func @selectV2Unranked(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x8x8xi32>, tensor<*xi32>) -> tensor<*xi32> + return %0: tensor<*xi32> +} + +// ----- + +// Test invalid tf.SelectV2: this is an invalid broadcast for the predicate +func @testInvalidSelectV2(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: tensor<3x2xf16>) -> tensor<3x2xf16> { + // expected-error @+1 {{operands don't have broadcast-compatible shapes}} + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<3x2xf16>, tensor<3x2xf16>) -> tensor<3x2xf16> + return %0: tensor<3x2xf16> +} + +// ----- + //===--------------------------------------------------------------------===// // tf.Softmax //===--------------------------------------------------------------------===// @@ -1317,7 +1695,7 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1361,7 +1739,7 @@ func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result #1 for unranked operand #1}} + // expected-warning @+1 {{has static shape result #1 for unranked operand #1}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) return %0#1 : tensor<2xi32> } @@ -1419,7 +1797,7 @@ func @testVariableShapeMismatchDim(%arg0: tensor<*x!tf.resource>>) -> 
tensor<2xi32> { - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2xi32> return %0 : tensor<2xi32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index 78c18a17d4a..b337224e680 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # CHECK: "tf_saved_model.global_tensor"() {is_mutable, sym_name = "[[VAR:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = ["v42"], type = tensor, value = dense<4.200000e+01> : tensor} : () -> () # CHECK: "tf_saved_model.global_tensor"() {sym_name = "[[CONST:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = [], type = tensor, value = dense<4.300000e+01> : tensor} : () -> () # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor>> {tf_saved_model.bound_input = @[[VAR]]}, # CHECK-SAME: %arg2: tensor>> {tf_saved_model.bound_input = @[[CONST]]}) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = []}) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py index 658cc37a22f..694942f4b00 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # modify signatures interprocedurally). # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, @@ -54,7 +54,7 @@ class TestModule(tf.Module): # CHECK: "tf.StatefulPartitionedCall"{{.*}}f = @[[CALLEE_INTERNAL:[a-zA-Z_0-9]+]] # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py new file mode 100644 index 00000000000..8bd128898a0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py @@ -0,0 +1,63 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/defun_export | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 +from tensorflow.python.framework import function + + +@function.Defun(tf.float32, tf.float32) +def plus(a, b): + return a + b + + +def test_defun(): + x = tf.constant([[1.0], [1.0], [1.0]]) + y = tf.constant([[2.0], [2.0], [2.0]]) + + # Verify that the function defined using function.Defun + # has a corresponding tf.LegacyCall op. + # CHECK: func {{@[a-zA-Z_0-9]+}}( + # CHECK-SAME: [[ARG0:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["y"]}, + # CHECK-SAME: [[ARG1:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]} + # + # CHECK-NEXT: [[R0:%.*]] = "tf.LegacyCall"([[ARG1]], [[ARG0]]) + z = plus(x, y) + + tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x) + tensor_info_y = tf.compat.v1.saved_model.utils.build_tensor_info(y) + tensor_info_z = tf.compat.v1.saved_model.utils.build_tensor_info(z) + + return { + 'key': (tf.compat.v1.saved_model.signature_def_utils.build_signature_def( + inputs={ + 'x': tensor_info_x, + 'y': tensor_info_y + }, + outputs={'z': tensor_info_z}, + method_name='test_function')) + } + + +if __name__ == '__main__': + common_v1.set_tf_options() + common_v1.do_test(test_defun()) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py index a95909b61ef..ffb5c024bbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py @@ -39,7 +39,7 @@ class TestModule(tf.Module): super(TestModule, self).__init__() self.model = mnist_model() - # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf_saved_model.index_path = [0]} + # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]} # CHECK: attributes {{.*}} tf_saved_model.exported_names = ["my_predict"] @tf.function(input_signature=[ tf.TensorSpec([1, 28, 28, 1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py index 095fddbda96..43591d12183 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py @@ -36,8 +36,8 @@ class TestModule(tf.Module): # The outer layer of the index path indexes into the arguments. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "y", tf_saved_model.index_path = [1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_function_arity"] @tf.function(input_signature=[ tf.TensorSpec([1], tf.float32), @@ -49,8 +49,8 @@ class TestModule(tf.Module): # Check index paths for lists. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0, 0]}, - # CHECK-SAME: %arg1: tensor {tf_saved_model.index_path = [0, 1]}) + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 0]}, + # CHECK-SAME: %arg1: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_list_2_elements"] @tf.function(input_signature=[[ tf.TensorSpec([], tf.float32), @@ -63,8 +63,8 @@ class TestModule(tf.Module): # Keys are linearized in sorted order, matching `tf.nest.flatten`. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_dict_2_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([1], tf.float32), @@ -77,8 +77,8 @@ class TestModule(tf.Module): # The index path should be insensitive to the key order. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_dict_2_keys_out_of_order"] @tf.function(input_signature=[{ 'y': tf.TensorSpec([2], tf.float32), @@ -90,12 +90,12 @@ class TestModule(tf.Module): # Slightly stronger stress test of multiple dict keys. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "a"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "b"]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "c"]}, - # CHECK-SAME: %arg3: tensor<4xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg4: tensor<5xf32> {tf_saved_model.index_path = [0, "y"]}, - # CHECK-SAME: %arg5: tensor<6xf32> {tf_saved_model.index_path = [0, "z"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "a"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "b"]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "c"]}, + # CHECK-SAME: %arg3: tensor<4xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg4: tensor<5xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}, + # CHECK-SAME: %arg5: tensor<6xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "z"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_dict_many_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([4], tf.float32), @@ -112,9 +112,9 @@ class TestModule(tf.Module): # Note that list elements can have heterogenous types. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x", 0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "x", 1]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 1]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_more_complex_recursive_structure"] @tf.function(input_signature=[{ 'x': [tf.TensorSpec([1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 38aa078358b..961039e7968 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -104,3 +104,9 @@ module attributes {tf_saved_model.semantics} { return } } + +// ----- + +// Test running the pass on a module that does not have +// tf_saved_model.semantics. +module {} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir index f985be16ab8..80d9a498253 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir @@ -136,3 +136,9 @@ module attributes {tf_saved_model.semantics} { } } + +// ----- + +// Test running the pass on a module that does not have +// tf_saved_model.semantics. 
+module {} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 77ca08c089a..d5fb821b5e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -1,30 +1,388 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure -// Tests extraction of a single outside compiled cluster with no input or output dependecies. +// Tests extraction of outside compiled ops at the head of a TPU computation. -// CHECK-LABEL: func @nodep_single_head_outside_compilation -func @nodep_single_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.B"() : () -> () - "tf.C"() : () -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return -} +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @head_single_outside_compiled_op + func @head_single_outside_compiled_op(%arg0: tensor) { + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.B"() : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } -// CHECK-LABEL: func @nodep_multiple_head_outside_compilation -func @nodep_multiple_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return + // CHECK-LABEL: func @head_single_outside_compiled_op_no_operands + func @head_single_outside_compiled_op_no_operands() { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + "tf.C"(%b) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @head_operand_op_outside_cluster + func @head_operand_op_outside_cluster() { + // CHECK: %[[A_OUT:.*]] = "tf.A" + %a = "tf.A"() : () -> tensor + // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = 
"tf_device.launch" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.C"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: "tf.D" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %c = "tf.C"(%b) : (tensor) -> tensor + "tf.D"(%c) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @head_aliased_output + func @head_aliased_output() -> (tensor, tensor, tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[B_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + %cluster:3 = "tf_device.cluster"() ( { + %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + %c = "tf.C"(%b) : (tensor) -> tensor + tf_device.return %a, %c, %b : tensor, tensor, tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor) + // CHECK: return %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#0, %[[CLUSTER_OUT]]#1 + return %cluster#0, %cluster#1, %cluster#2 : tensor, tensor, tensor + } + + // CHECK-LABEL: func @head_all_cluster_op + func @head_all_cluster_op(%arg0: tensor) -> tensor { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]], %arg0) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %c = "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + tf_device.return %c : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor + // CHECK: return %[[LAUNCH_OUT]] + return %cluster : tensor + } + + // CHECK-LABEL: func @head_multiple_outside_compiled_ops + func @head_multiple_outside_compiled_ops(%arg0: tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> 
tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%b) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @head_replicated_outside_compilation + func @head_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) + // + // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { + "tf_device.cluster"() ( { + %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.B"(%a) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } + + // CHECK-LABEL: func @tail_single_outside_compiled_op + func @tail_single_outside_compiled_op() { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @tail_single_outside_compiled_op_user + func @tail_single_outside_compiled_op_user() -> tensor { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"() : () -> () + tf_device.return %b : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor + // CHECK: return %[[LAUNCH_OUT]] + return %cluster : tensor + } + + // CHECK-LABEL: func @tail_multiple_outside_compiled_ops + func @tail_multiple_outside_compiled_ops(%arg0: tensor) { + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: 
%[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: tf_device.return %[[B_OUT]], %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: "tf_device.launch" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%arg0, %[[CLUSTER_OUT]]#1) + // CHECK-NEXT: "tf.D"(%[[C_OUT]], %arg0, %[[CLUSTER_OUT]]#0) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + %b = "tf.B"(%arg0) : (tensor) -> tensor + %c = "tf.C"(%arg0, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + "tf.D"(%c, %arg0, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @tail_aliased_output + func @tail_aliased_output() -> (tensor, tensor, tensor, tensor, tensor) { + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + %a = "tf.A"() : () -> tensor + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + %b = "tf.B"() : () -> tensor + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: %[[E_OUT:.*]] = "tf.E" + // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[E_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[D_OUT:.*]] = "tf.D"(%[[CLUSTER_OUT]]#0, %[[A_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster:5 = "tf_device.cluster"() ( { + %c = "tf.C"() : () -> tensor + %d = "tf.D"(%c, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %e = "tf.E"() : () -> tensor + tf_device.return %a, %b, %c, %d, %e : tensor, tensor, tensor, tensor, tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor, tensor, tensor) + // CHECK: return %[[A_OUT]], %[[B_OUT]], %[[CLUSTER_OUT]]#0, %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#1 + return %cluster#0, %cluster#1, %cluster#2, %cluster#3, %cluster#4 : tensor, tensor, tensor, tensor, tensor + } + + // CHECK-LABEL: func @tail_replicated_outside_compilation + func @tail_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK-NEXT: "tf_device.launch"() + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]], %[[RI]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { + "tf_device.cluster"() ( { + %a = "tf.A"(%ri) : (tensor) -> tensor + %b = "tf.B"(%a, %ri) {_xla_outside_compilation = "cluster1"} : 
(tensor, tensor) -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } + + // CHECK-LABEL: func @head_tail_no_extraction_middle_outside_compiled_ops + func @head_tail_no_extraction_middle_outside_compiled_ops(%arg0: tensor) { + // CHECK-NOT: "tf_device.launch" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) : (tensor) -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"(%b) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @head_tail_simple_extraction + func @head_tail_simple_extraction(%arg0: tensor) -> tensor { + // CHECK: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%arg0) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[TAIL_LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + %c = "tf.C"(%b) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + tf_device.return %c : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor + // CHECK: return %[[TAIL_LAUNCH_OUT]] + return %cluster : tensor + } + + // CHECK-LABEL: func @head_tail_replicated_outside_compilation + func @head_tail_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) + // + // CHECK-NEXT: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[RI]], %[[B_OUT]]) + // CHECK-NEXT: "tf.E"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK-NEXT: "tf_device.launch"() + // CHECK-NEXT: "tf.D"(%[[HEAD_LAUNCH_OUT]], %[[CLUSTER_OUT]], %[[RI]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { + "tf_device.cluster"() ( { + %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) 
-> tensor + %b = "tf.B"() : () -> tensor + %c = "tf.C"(%ri, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %d = "tf.D"(%a, %c, %ri) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> tensor + %e = "tf.E"(%c, %a) : (tensor, tensor) -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index b2e8f116827..0c4de285b16 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -3,12 +3,12 @@ // Tests that missing `_xla_outside_compilation` attribute value results in an error. func @missing_outside_compilation_attribute() -> () { - "tf_device.launch"() ( { + "tf_device.cluster"() ( { "tf.A"() : () -> () // expected-error@+1 {{attribute '_xla_outside_compilation' is empty}} "tf.B"() {_xla_outside_compilation = ""} : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -18,11 +18,11 @@ func @missing_outside_compilation_attribute() -> () { // CHECK-LABEL: func @no_outside_compilation func @no_outside_compilation() -> tensor { - %0 = "tf_device.launch"() ( { + %0 = "tf_device.cluster"() ( { %1 = "tf.A"() : () -> tensor %2 = "tf.B"(%1) : (tensor) -> tensor tf_device.return %2 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor return %0 : tensor } @@ -36,16 +36,15 @@ func @nodep_single_outside_compilation() -> () { // CHECK-NEXT: "tf_device.launch" // CHECK-NEXT: "tf.B" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf_device.launch" + // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.A" - // CHECK: device = "tpu0" - // CHECK-SAME: launch_attr = "launch_attr" - "tf_device.launch"() ( { + // CHECK: cluster_attr = "cluster_attr" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.C"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -59,19 +58,18 @@ func @nodep_single_cluster_multiple_ops_outside_compilation() -> () { // CHECK-NEXT: "tf.C" // CHECK-NEXT: "tf.D" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf_device.launch" + // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.A" // CHECK-NEXT: "tf.E" - // CHECK: device = "tpu0" - // CHECK-SAME: launch_attr = "launch_attr" - "tf_device.launch"() ( { + // CHECK: cluster_attr = "cluster_attr" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.E"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -80,15 +78,16 @@ func @nodep_single_cluster_multiple_ops_outside_compilation() -> () { // CHECK-LABEL: func @nodep_multiple_outside_compilation func @nodep_multiple_outside_compilation() -> () { // CHECK: "tf_device.parallel_execute" - // CHECK-COUNT-3: "tf_device.launch" - 
"tf_device.launch"() ( { + // CHECK-COUNT-2: "tf_device.launch" + // CHECK: "tf_device.cluster" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.C"() : () -> () "tf.D"() {_xla_outside_compilation = "cluster2"} : () -> () "tf.E"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -97,20 +96,20 @@ func @nodep_multiple_outside_compilation() -> () { // CHECK-LABEL: func @single_tpu_return_single_outside_compilation func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch" - // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_LAUNCH_OUTPUT]] - // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster" + // CHECK: tf_device.return + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2 = "tf_device.launch"() ( { + %2 = "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () %3 = "tf.C"() : () -> tensor tf_device.return %3 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor tf_device.return %2 : tensor } @@ -122,24 +121,321 @@ func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tens // CHECK-LABEL: func @multiple_tpu_return_single_outside_compilation func @multiple_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_LAUNCH_OUTPUT:[0-9]*]]:2 = "tf_device.launch" - // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_LAUNCH_OUTPUT]] - // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] + // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: tf_device.return + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:4 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2, %3 = "tf_device.launch"() ( { + %2, %3 = "tf_device.cluster"() ( { %4 = "tf.A"() : () -> tensor "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () %5 = "tf.C"() : () -> tensor tf_device.return %4, %5 : tensor, tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> (tensor, tensor) + }) {cluster_attr = "cluster_attr"} : () -> (tensor, tensor) tf_device.return %2, %3 : tensor, tensor } return %1 : tensor } -// TODO(b/154363171): Add test cases for when output of outside compilation is returned by parallel_execute. 
+// Tests extraction of a single outside compiled cluster with single device->host input. + +// CHECK-LABEL: func @single_outside_compiled_input_single_outside_compilation +func @single_outside_compiled_input_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + %4 = "tf.C"() : () -> tensor + tf_device.return %4 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster with single host->device output. + +// CHECK-LABEL: func @single_outside_compiled_output_single_outside_compilation +func @single_outside_compiled_output_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"() + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[HOST_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster host output returned by TPU cluster. 
+ +// CHECK-LABEL: func @return_host_output_outside_compilation +func @return_host_output_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: tf_device.return %[[HOST_OUTPUT]] + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5 = "tf.C"(%3) : (tensor) -> (tensor) + tf_device.return %4 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster with single input/output. + +// CHECK-LABEL: func @single_outside_compiled_input_output_single_outside_compilation +func @single_outside_compiled_input_output_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[HOST_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + + +// Tests extraction of a single outside compiled cluster with multiple input/output. 
+
+// CHECK-LABEL: func @multiple_outside_compiled_input_output_single_outside_compilation
+func @multiple_outside_compiled_input_output_single_outside_compilation(%arg0: tensor) -> tensor {
+ %0 = "tf.A"(%arg0) : (tensor) -> tensor
+ // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate
+ // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
+ // CHECK-NEXT: "tf_device.launch"
+ // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir"
+ // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]])
+ // CHECK: %[[B_OUTPUT:[0-9]*]]:2 = "tf.C"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1)
+ // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]]#0, %[[B_OUTPUT]]#1, %[[PROGRAM_OUTPUT]])
+ // CHECK-SAME: key = "host_compute_channel_cluster1"
+ // CHECK: "tf_device.cluster"
+ // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"
+ // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"
+ // CHECK: %[[HOST_OUTPUT:[0-9]*]]:2 = "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]])
+ // CHECK-SAME: key = "host_compute_channel_cluster1"
+ // CHECK: "tf.D"(%[[HOST_OUTPUT]]#0)
+ // CHECK: "tf.E"(%[[HOST_OUTPUT]]#1)
+ %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} {
+ %2 = "tf_device.cluster"() ( {
+ %3 = "tf.A"() : () -> (tensor)
+ %4 = "tf.B"() : () -> (tensor)
+ %5, %6 = "tf.C"(%3, %4) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor, tensor)
+ %7 = "tf.D"(%5) : (tensor) -> tensor
+ %8 = "tf.E"(%6) : (tensor) -> tensor
+ tf_device.return %8 : tensor
+ }) {cluster_attr = "cluster_attr"} : () -> tensor
+ tf_device.return %2 : tensor
+ }
+
+ return %1 : tensor
+}
+
+// Tests extraction of multiple outside compiled clusters with input/output.
+
+// CHECK-LABEL: func @outside_compiled_input_output_multiple_outside_compilation
+func @outside_compiled_input_output_multiple_outside_compilation(%arg0: tensor) -> tensor {
+ %0 = "tf.A"(%arg0) : (tensor) -> tensor
+ // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate
+ // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
+ // CHECK-NEXT: "tf_device.launch"
+ // CHECK: %[[STATUS_OUTPUT2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT2:[a-z_0-9]*]] = "tf._TPUCompileMlir"
+ // CHECK: %[[RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT2]])
+ // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT2]])
+ // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PROGRAM_OUTPUT2]])
+ // CHECK-SAME: key = "host_compute_channel_cluster2"
+ // CHECK: "tf_device.launch"
+ // CHECK: %[[STATUS_OUTPUT1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT1:[a-z_0-9]*]] = "tf._TPUCompileMlir"
+ // CHECK: %[[RECV_OUTPUT1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT1]])
+ // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT1]])
+ // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT1]])
+ // CHECK-SAME: key = "host_compute_channel_cluster1"
+ // CHECK: "tf_device.cluster"
+ // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"
+ // CHECK: %[[HOST_OUTPUT1:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]])
+ // CHECK-SAME: key = "host_compute_channel_cluster1"
+ // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[HOST_OUTPUT1]])
+ // CHECK: %[[HOST_OUTPUT2:[0-9]*]] = "tf._HostComputeMlir"(%[[C_OUTPUT]])
+ // CHECK-SAME: key = "host_compute_channel_cluster2"
+ // CHECK: "tf.E"(%[[HOST_OUTPUT2]])
+ %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} {
+ %2 = "tf_device.cluster"() ( {
+ %3 = "tf.A"() : () -> (tensor)
+ %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor)
+ %5 = 
"tf.C"(%4) : (tensor) -> (tensor) + %6 = "tf.D"(%5) {_xla_outside_compilation = "cluster2"} : (tensor) -> (tensor) + %7 = "tf.E"(%6) : (tensor) -> tensor + tf_device.return %7 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster with arg input and single device->host input. + +// CHECK-LABEL: func @mixed_input_single_outside_compilation +func @mixed_input_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%arg0, %[[RECV_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + "tf.B"(%arg0, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + %4 = "tf.C"() : () -> tensor + tf_device.return %4 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a multiple outside compiled clusters with single device->host input. + +// CHECK-LABEL: func @single_outside_compiled_input_multiple_outside_compilation +func @single_outside_compiled_input_multiple_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT_2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_2]]) + // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) + // CHECK: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT_1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT_1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_1]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%[[RECV_OUTPUT_1]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK: "tf._HostComputeMlir"(%[[C_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster2" + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + %4 = "tf.C"() : () -> tensor + "tf.D"(%4) {_xla_outside_compilation = "cluster2"} : (tensor) -> () + tf_device.return %4 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster with multiple device->host 
inputs. + +// CHECK-LABEL: func @multiple_outside_compiled_inputs_single_outside_compilation +func @multiple_outside_compiled_inputs_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[RECV_OUTPUT]]#0) + // CHECK: "tf.D"(%[[RECV_OUTPUT]]#1, %[[RECV_OUTPUT]]#0) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + "tf.C"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.D"(%4, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index af0119dab8f..5d65342b4a7 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -747,7 +747,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.cluster_func` on TPU with replication. +// Tests simple case of `tf_device.cluster_func` on TPU with replication. Under +// data parallelism replicated host devices are also added to the +// tf_device.replicate module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { // CHECK-LABEL: func @replicated_tpu_cluster_func @@ -758,7 +760,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} // CHECK-SAME: n = 2 %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) @@ -1222,6 +1224,51 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +// Tests simple case of `tf_device.cluster_func` on TPU with replication and +// parallel_execute. 
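+// The single "tf._TPUCompileMlir" result is expected to be forwarded to the
+// other parallel_execute regions (the "tf.D" and "tf.E" regions below), so no
+// additional compile ops are introduced for them.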
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { + // CHECK-LABEL: func @replicated_parallel_tpu_cluster_func + func @replicated_parallel_tpu_cluster_func(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" + // CHECK: "tf._TPUCompileMlir" + // CHECK: "tf.TPUCompileSucceededAssert" + // CHECK: "tf_device.parallel_execute" + // CHECK-NOT:"tf._TPUCompileMlir" + // CHECK: "tf.D"(%[[COMPILE_OUTPUT]]#1 + // CHECK: "tf.TPUExecute" + // CHECK-NOT:"tf._TPUCompileMlir" + // CHECK: "tf.E"(%[[COMPILE_OUTPUT]]#1 + %3 = "tf_device.parallel_execute"() ( { + %status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor, tensor) + "tf.D"(%program) : (tensor) -> () + tf_device.return + }, { + %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + tf_device.return %4 : tensor + }, { + %status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor, tensor) + "tf.E"(%program) : (tensor) -> () + tf_device.return + }) : () -> (tensor) + tf_device.return %3 : tensor + } + %2 = "tf.C"(%1#1) : (tensor) -> tensor + return %2 : tensor + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + // Tests devices are set properly for non replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} { @@ -1282,15 +1329,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01" // ----- -// Tests devices are set properly for replicated model parallelism. +// Tests devices are set properly for replicated model parallelism. No +// replicated host device should be present. 
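+// (The devices map checked below contains only TPU_REPLICATED_CORE_* aliases;
+// no TPU_REPLICATED_HOST alias is added in this case.)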
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @replicated_parallel_execute func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { // CHECK: tf_device.replicate - // CHECK-SAME: devices = - // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] - // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]} %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() @@ -1322,8 +1368,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests that inputs are inputs with maximal and replicate sharding are set properly -// for replicated model parallelism. +// Tests that inputs are inputs with maximal and replicate sharding are set +// properly for replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_input_with_sharding_configurations @@ -1357,8 +1403,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests devices are set properly for replicated model parallelism with -// outputs to TPU computation placed on logical device 0. +// Tests devices are set properly for replicated model parallelism with outputs +// to TPU computation placed on logical device 0. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_different_outputs @@ -1434,8 +1480,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests inputs are correctly split and fed into TPU computation for -// tiled input sharding. +// Tests inputs are correctly split and fed into TPU computation for tiled input +// sharding. 
// The following OpSharding is used for TPU computation inputs in below test: // Proto debug string: diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir new file mode 100644 index 00000000000..aa333caa2ae --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir @@ -0,0 +1,87 @@ +// RUN: tf-opt %s -split-input-file -tf-tpu-space-to-depth-pass | FileCheck %s --dump-input=fail + +// Tests for space to depth host and device transform. + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:1" = {}, "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 390 : i32}} { + func @main(%arg0: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg1: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg2: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg4: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg5: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) attributes {tf.entry_function = {control_outputs = "while", inputs = "iterator,iterator_1,iterator_2,iterator_3,while_input_6,while_input_7,while_input_8,while_input_9", outputs = ""}} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %3:10 = "tf.While"(%2, %1, %2, %0, %1, %arg2, %arg4, %arg5, %arg6, %arg7) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = @while_body_2710, cond = @while_cond_2700, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) + return + } + // CHECK-LABEL: func @while_body_2710 + func @while_body_2710(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg8: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg9: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) attributes {tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[INPUT:.*]] = "tf.IteratorGetNext" + %1 = "tf.IteratorGetNext"(%arg5) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor<2x224x224x3xf32> + // CHECK-DAG: %[[SPACETODEPTH0:.*]] = "tf.SpaceToDepth"([[INPUT:.*]]) {block_size = 2 : i64, 
data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + %2 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor, tensor) -> tensor + %3 = "tf.ReadVariableOp"(%arg6) : (tensor>>) -> tensor<7x7x3x64xf32> + %4 = "tf.ReadVariableOp"(%arg8) : (tensor>>) -> tensor + %5 = "tf.ReadVariableOp"(%arg7) : (tensor>>) -> tensor + %6 = "tf.ReadVariableOp"(%arg9) : (tensor>>) -> tensor + %7:2 = "tf_device.cluster_func"(%1, %3, %5, %6) {_tpu_replicate = "while/cluster_while_body_271", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [0, 0, 0, 0], func = @_func, host_compute_core = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], num_cores_per_replica = 1 : i64, output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", use_tpu = true} : (tensor<2x224x224x3xf32>, tensor<7x7x3x64xf32>, tensor, tensor) -> (tensor<7x7x3x64xf32>, tensor) + "tf.AssignVariableOp"(%arg6, %7#0) : (tensor>>, tensor<7x7x3x64xf32>) -> () + "tf.AssignVariableOp"(%arg9, %7#1) : (tensor>>, tensor) -> () + %8 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor + %9 = "tf.Identity"(%2) {device = ""} : (tensor) -> tensor + %10 = "tf.AddV2"(%arg0, %0) {device = ""} : (tensor, tensor) -> tensor + %11 = "tf.Identity"(%10) {device = ""} : (tensor) -> tensor + return %11, %8, %9, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9 : tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>> + } + func @while_cond_2700(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg8: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg9: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) -> tensor { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %1 = "tf.GreaterEqual"(%arg3, %0) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.Less"(%arg3, %0) {device = ""} : (tensor, tensor) -> tensor + %3 = "tf.Greater"(%arg2, %arg4) {device = ""} : (tensor, tensor) -> tensor + %4 = "tf.LogicalAnd"(%2, %3) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.Less"(%arg2, %arg4) {device = ""} : (tensor, tensor) -> tensor + %6 = "tf.LogicalAnd"(%1, %5) {device = ""} : (tensor, tensor) -> tensor + %7 = "tf.LogicalOr"(%6, %4) {device = ""} : (tensor, tensor) -> tensor + %8 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor, tensor) -> tensor + %9 = "tf.LogicalAnd"(%8, %7) {device = ""} : (tensor, tensor) -> tensor + %10 = "tf.Identity"(%9) {device = ""} : (tensor) -> tensor + return %10 : tensor + } + // CHECK-LABEL: func @_func + // CHECK-SAME: [[FUNCINPUT0:.*]]: tensor<2x112x112x12xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[FUNCINPUT1:%.*]]: tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[FUNCINPUT2:%.*]]: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[VAL_59:%.*]]: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) 
attributes {sym_visibility = "private"} { + func @_func(%arg0: tensor<2x224x224x3xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg1: tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg3: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) attributes {sym_visibility = "private"} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32> + %2 = "tf.Const"() {value = dense<[7, 7, 3, 64]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tf.Const"() {value = dense<[[0, 0], [3, 3], [3, 3], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + %4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %5 = "tf.Pad"(%arg0, %3) : (tensor<2x224x224x3xf32>, tensor<4x2xi32>) -> tensor<2x230x230x3xf32> + // CHECK: "tf.Conv2D" + // CHECK-SAME: strides = [1, 1, 1, 1] + // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4x4x12x64xf32>) -> tensor<2x112x112x64xf32> + %6 = "tf.Conv2D"(%5, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<2x112x112x64xf32> + // CHECK: %[[BACKPROP:.*]] = "tf.Conv2DBackpropFilter" + // CHECK-SAME: strides = [1, 1, 1, 1] + // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<4x4x12x64xf32> + %7 = "tf.Conv2DBackpropFilter"(%5, %2, %6) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<7x7x3x64xf32> + // CHECK: %[[CONST0:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [4, 4, 2, 2, 3, 64] + // CHECK: %[[RESHAPE0:.*]] = "tf.Reshape"(%[[BACKPROP:.*]], %[[CONST0:.*]]) : (tensor<4x4x12x64xf32>, tensor<6xi64>) -> tensor<4x4x2x2x3x64xf32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [0, 2, 1, 3, 4, 5] + // CHECK: %[[TRANSPOSE0:.*]] = "tf.Transpose"(%[[RESHAPE0:.*]], %[[CONST1:.*]]) : (tensor<4x4x2x2x3x64xf32>, tensor<6xi32>) -> tensor<4x2x4x2x3x64xf32> + // CHECK: %[[CONST2:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [8, 8, 3, 64] + // CHECK: %[[RESHAPE1:.*]] = "tf.Reshape"(%[[TRANSPOSE1:.*]], %[[CONST2:.*]]) : (tensor<4x2x4x2x3x64xf32>, tensor<4xi64>) -> tensor<8x8x3x64xf32> + // CHECK: %[[CONST3:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [7, 7, 3, 64] + // CHECK: %[[CONST4:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: 0 + // CHECK: %[[SLICE0:.*]] = "tf.Slice"(%[[RESHAPE1:.*]], %[[CONST4:.*]], %[[CONST3:.*]]) : (tensor<8x8x3x64xf32>, tensor<4xi64>, tensor<4xi32>) -> tensor<7x7x3x64xf32> + %8 = "tf.CrossReplicaSum"(%7, %1) : (tensor<7x7x3x64xf32>, tensor<1x1xi32>) -> tensor<7x7x3x64xf32> + %9 = "tf.Mul"(%arg2, %8) : (tensor, tensor<7x7x3x64xf32>) -> tensor<7x7x3x64xf32> + %10 = "tf.Sub"(%arg1, %9) : (tensor<7x7x3x64xf32>, tensor<7x7x3x64xf32>) -> tensor<7x7x3x64xf32> + %11 = "tf.AddV2"(%arg3, %0) : (tensor, tensor) -> tensor + return %10, %11 : tensor<7x7x3x64xf32>, tensor + } +} + +// ---- + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 
ccc3e83a2a2..cf09f8d64fb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -152,6 +152,23 @@ def RealDivWithSqrtDivisor : Pat<(TF_RealDivOp $arg0, (TF_SqrtOp $arg1)), def ReciprocalNested : Pat<(TF_ReciprocalOp (TF_ReciprocalOp $arg)), (replaceWithValue $arg)>; +//===----------------------------------------------------------------------===// +// Select op patterns. +//===----------------------------------------------------------------------===// + +def ReshapeSelectPredIfNecessary : NativeCodeCall< + "ReshapeSelectPredIfNecessary(&($_builder), $0.getOwner()->getLoc(), $1, " + "$2.getType().cast().getRank())">; + +// Select supports tensor `condition` where the shape is equal to the first +// dimension of t and e. SelectV2 op supports normal broadcasting, so in these +// cases the condition needs to be reshaped. +def SelectToSelectV2 : Pat< + (TF_SelectOp:$op StaticShapeTensorOf<[AnyType]>:$cond, + StaticShapeTensorOf<[AnyType]>:$t, + StaticShapeTensorOf<[AnyType]>:$e), + (TF_SelectV2Op (ReshapeSelectPredIfNecessary $op, $cond, $t), $t, $e)>; + //===----------------------------------------------------------------------===// // Square op patterns. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index be35c6caa16..55a0b5c3fd3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index 08645333d5d..e04f6bf3daa 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -49,6 +49,16 @@ struct TPUBridgeExecutorIslandOutlining void runOnOperation() override; }; +// Move FuncOp referenced by `symbol_ref` from one symbol table to another. 
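+// If the destination table already contains a function with that name the
+// reference is left untouched; otherwise the callee is detached from its
+// current block and inserted into `to`.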
+void MoveFuncOp(FlatSymbolRefAttr &symbol_ref, SymbolTable &from, + SymbolTable &to) { + if (to.lookup(symbol_ref.getValue())) return; + FuncOp callee = from.lookup(symbol_ref.getValue()); + callee.getOperation()->getBlock()->getOperations().remove( + callee.getOperation()); + to.insert(callee); +} + void TPUBridgeExecutorIslandOutlining::runOnOperation() { MLIRContext *ctx = &getContext(); @@ -141,14 +151,17 @@ void TPUBridgeExecutorIslandOutlining::runOnOperation() { for (FuncOp func : outlined_module.getOps()) { func.walk([&](Operation *op) { for (NamedAttribute attr : op->getAttrs()) { - auto symbol_ref = attr.second.dyn_cast(); - if (!symbol_ref) continue; - if (outlined_symbol_table.lookup(symbol_ref.getValue())) + if (auto symbol_ref = attr.second.dyn_cast()) { + MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); continue; - FuncOp callee = symbol_table.lookup(symbol_ref.getValue()); - callee.getOperation()->getBlock()->getOperations().remove( - callee.getOperation()); - outlined_symbol_table.insert(callee); + } + if (auto array_attr = attr.second.dyn_cast()) { + for (const Attribute &attribute : array_attr) { + auto symbol_ref = attribute.dyn_cast(); + if (!symbol_ref) continue; + MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); + } + } } }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc index d3b064f3efa..a0cf9c8eb9a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -48,6 +48,9 @@ struct FreezeGlobalTensorsPass void FreezeGlobalTensorsPass::runOnOperation() { auto module = getOperation(); + if (!tf_saved_model::HasTfSavedModelSemantics(module)) { + return; + } SymbolTable symbol_table(module); DenseSet frozen_global_tensors; @@ -66,7 +69,9 @@ void FreezeGlobalTensorsPass::runOnOperation() { // previous optimize global tensors pass). If not, this pass has to fail // since it cannot perform one of its goals. if (global_tensor.is_mutable()) { - global_tensor.emitError() << "is not immutable"; + global_tensor.emitError() << "is not immutable, try running " + "tf-saved-model-optimize-global-tensors " + "to prove tensors are immutable"; return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index 50f77cd9c3d..d635d605607 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -16,21 +16,64 @@ limitations under the License. // This file implements logic for legalizing HLO to TensorFlow. 
#include +#include +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" namespace mlir { namespace TF { namespace { +class ConvertSliceOp : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + xla_hlo::SliceOp slice_op, ArrayRef args, + ConversionPatternRewriter &rewriter) const final { + DenseIntElementsAttr strides = slice_op.strides(); + // Strides must be 1 otherwise we cannot legalize this `xla_hlo.slice` op. + if (!strides.isSplat() || + strides.getSplatValue().cast().getInt() != 1) + return failure(); + + rewriter.setInsertionPointAfter(slice_op); + auto start_indices = slice_op.start_indices(); + auto limit_indices = slice_op.limit_indices(); + std::vector size_values; + for (auto pair : llvm::zip(start_indices.getValues(), + limit_indices.getValues())) { + size_values.emplace_back(std::get<1>(pair).getSExtValue() - + std::get<0>(pair).getSExtValue()); + } + + RankedTensorType ty = + RankedTensorType::get({static_cast(size_values.size())}, + rewriter.getIntegerType(64)); + auto start = rewriter.create(slice_op.getLoc(), start_indices); + auto size = rewriter.create( + slice_op.getLoc(), DenseIntElementsAttr::get(ty, size_values)); + rewriter.replaceOpWithNewOp(slice_op, slice_op.getType(), + slice_op.operand(), start, size); + return success(); + }; +}; + class LegalizeHloToTf : public PassWrapper { public: LegalizeHloToTf() = default; @@ -54,6 +97,68 @@ static bool AreBroadcastCompatible(Value x, Value y) { y_ranked.getShape(), resultShape); } +// Returns the shape of the given value in a Constant Op. +ConstantOp ShapeToConst(PatternRewriter &rewriter, Value value) { + ArrayRef shape = value.getType().cast().getShape(); + auto attr_type = RankedTensorType::get({static_cast(shape.size())}, + rewriter.getIntegerType(64)); + auto attr = DenseElementsAttr::get(attr_type, shape); + return rewriter.create(value.getLoc(), attr_type, attr); +} + +// Converts xla_hlo.dot to tf.MatMul. Reshape ops will be inserted when +// necessary. +Value ConvertDotOp(PatternRewriter &rewriter, Operation *old_op) { + auto dot_op = cast(old_op); + const mlir::Location loc = dot_op.getLoc(); + // Normalizes a ShapedType to 2d if the ShapedType is less than 2d by + // inserting dummy 1-element dimensions in the begining. Does nothing if the + // old shape is already 2d or higher. This is necessary because tf.MatMul + // requires input tensors to be at least 2d. 
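+ // For example, a rank-1 shape such as [k] is padded to [1, k] and a scalar
+ // becomes [1, 1]; shapes that are already 2d or higher are returned as is.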
+ const auto normalize_rank = [](ShapedType type) -> ShapedType { + if (type.getRank() >= 2) { + return type; + } + + const int rank = type.getRank(); + llvm::SmallVector shape_2d(type.getShape().begin(), + type.getShape().end()); + for (int i = 0; i < 2 - rank; ++i) { + shape_2d.insert(shape_2d.begin(), 1); + } + return RankedTensorType::get(shape_2d, type.getElementType()); + }; + + // Reshapes a tensor value to 2d if it is 1d or scalar. Otherwise does + // nothing. + const auto reshape_to_2d = [&rewriter, &loc, + &normalize_rank](mlir::Value input) { + const auto input_type = input.getType().cast(); + if (input_type.getRank() >= 2) { + return input; + } + + auto reshape = rewriter.create( + loc, normalize_rank(input_type), input); + return reshape.getResult(); + }; + + // Reshapes both operand to be 2d for tf.MatMul op. + auto a = reshape_to_2d(dot_op.lhs()); + auto b = reshape_to_2d(dot_op.rhs()); + // Operand `b` needs to be transposed if it is 1d. This is because dot op will + // contract on the only dimension if rhs is 1d. + auto b_old_type = dot_op.rhs().getType().cast(); + BoolAttr transpose_b = rewriter.getBoolAttr(b_old_type.getRank() == 1); + auto output_type = dot_op.getResult().getType().cast(); + auto matmul = rewriter.create( + loc, normalize_rank(output_type), a, b, + /*transpose_a=*/rewriter.getBoolAttr(false), transpose_b); + auto reshape = + rewriter.create(loc, output_type, matmul.product()); + return reshape.getResult(); +} + #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_legalize_hlo.inc" /// Performs the lowering to XLA dialect. @@ -63,12 +168,15 @@ void LegalizeHloToTf::runOnFunction() { // Add legalization patterns to the list. OwningRewritePatternList patterns; populateWithGenerated(&context, &patterns); + patterns.insert(&context); ConversionTarget target(context); target.addLegalDialect(); - target.addLegalOp(); - if (failed(applyPartialConversion(getFunction(), target, patterns))) + target.addLegalOp(); + if (failed(applyPartialConversion(getFunction(), target, patterns))) { + getFunction().emitError("xla_hlo to TF legalization failed."); signalPassFailure(); + } } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td index f3371989b73..e4b6a28d65f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td @@ -18,49 +18,65 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" -def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; - -//===----------------------------------------------------------------------===// -// Binary op patterns. 
-//===----------------------------------------------------------------------===// - // Check that two values can be broadcasted together // TODO(jpienaar): Move somewhere more general def AreBroadcastCompatible : Constraint, "types must be broadcastable">; -foreach fromToBinPair = [[HLO_AddOp, TF_AddV2Op], - [HLO_DivOp, TF_DivOp], - [HLO_ShiftLeftOp, TF_LeftShiftOp], - [HLO_MaxOp, TF_MaximumOp], - [HLO_MinOp, TF_MinimumOp], - [HLO_MulOp, TF_MulOp], - [HLO_PowOp, TF_PowOp], - [HLO_SubOp, TF_SubOp], - [HLO_Atan2Op, TF_Atan2Op], - [HLO_RemOp, TF_ModOp]] in - def : Pat<(fromToBinPair[0] $l, $r, $_), (fromToBinPair[1] $l, $r), - [(AreBroadcastCompatible $l, $r)]>; +// Return a constant op that carries the shape of the given value. +def ShapeToConst : NativeCodeCall<"ShapeToConst($_builder, $0)">; -foreach pair = [[HLO_AndOp, TF_BitwiseAndOp], - [HLO_OrOp, TF_BitwiseOrOp], - [HLO_XorOp, TF_BitwiseXorOp]] in - def : Pat<(pair[0] TF_IntTensor:$l, TF_IntTensor:$r, $_), (pair[1] $l, $r), - [(AreBroadcastCompatible $l, $r)]>; +def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; -foreach pair = [[HLO_AndOp, TF_LogicalAndOp], - [HLO_OrOp, TF_LogicalOrOp]] in - def : Pat<(pair[0] I1Tensor:$l, I1Tensor:$r, $_), (pair[1] $l, $r), - [(AreBroadcastCompatible $l, $r)]>; +//===----------------------------------------------------------------------===// +// Binary op patterns. +// Note that these are legalized from chlo.broadcast_* ops, since those are +// semantically compatible with the corresponding TF ops. Depending on +// context, getting to these ops may require some raising. +//===----------------------------------------------------------------------===// -def : Pat<(HLO_ShiftRightArithmeticOp $l, $r, $_), (TF_RightShiftOp $l, $r), +foreach fromToBinPair = [[HLO_AddOp, HLOClient_BroadcastAddOp, TF_AddV2Op], + [HLO_DivOp, HLOClient_BroadcastDivOp, TF_DivOp], + [HLO_ShiftLeftOp, HLOClient_BroadcastShiftLeftOp, TF_LeftShiftOp], + [HLO_MaxOp, HLOClient_BroadcastMaxOp, TF_MaximumOp], + [HLO_MinOp, HLOClient_BroadcastMinOp, TF_MinimumOp], + [HLO_MulOp, HLOClient_BroadcastMulOp, TF_MulOp], + [HLO_PowOp, HLOClient_BroadcastPowOp, TF_PowOp], + [HLO_SubOp, HLOClient_BroadcastSubOp, TF_SubOp], + [HLO_Atan2Op, HLOClient_BroadcastAtan2Op, TF_Atan2Op], + [HLO_RemOp, HLOClient_BroadcastRemOp, TF_ModOp]] in { + def : Pat<(fromToBinPair[0] $l, $r), (fromToBinPair[2] $l, $r)>; + def : Pat<(fromToBinPair[1] $l, $r, $_), (fromToBinPair[2] $l, $r), + [(AreBroadcastCompatible $l, $r)]>; +} + +foreach pair = [[HLO_AndOp, HLOClient_BroadcastAndOp, TF_BitwiseAndOp], + [HLO_OrOp, HLOClient_BroadcastOrOp, TF_BitwiseOrOp], + [HLO_XorOp, HLOClient_BroadcastXorOp, TF_BitwiseXorOp]] in { + def : Pat<(pair[0] TF_IntTensor:$l, TF_IntTensor:$r), (pair[2] $l, $r)>; + def : Pat<(pair[1] TF_IntTensor:$l, TF_IntTensor:$r, $_), (pair[2] $l, $r), + [(AreBroadcastCompatible $l, $r)]>; +} + +foreach pair = [[HLO_AndOp, HLOClient_BroadcastAndOp, TF_LogicalAndOp], + [HLO_OrOp, HLOClient_BroadcastOrOp, TF_LogicalOrOp]] in { + def : Pat<(pair[0] I1Tensor:$l, I1Tensor:$r), (pair[2] $l, $r)>; + def : Pat<(pair[1] I1Tensor:$l, I1Tensor:$r, $_), (pair[2] $l, $r), + [(AreBroadcastCompatible $l, $r)]>; +} + +def : Pat<(HLO_ShiftRightArithmeticOp $l, $r), (TF_RightShiftOp $l, $r)>; +def : Pat<(HLOClient_BroadcastShiftRightArithmeticOp $l, $r, $_), (TF_RightShiftOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; -def : Pat<(HLO_ShiftRightLogicalOp $l, $r, $_), (TF_RightShiftOp $l, $r), +def : Pat<(HLO_ShiftRightLogicalOp $l, $r), (TF_RightShiftOp $l, $r)>; 
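+// As with the groups above, the chlo.broadcast_* form of each op is only
+// rewritten when its operands are broadcast compatible.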
+def : Pat<(HLOClient_BroadcastShiftRightLogicalOp $l, $r, $_), (TF_RightShiftOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; -def : Pat<(HLO_FloorOp (HLO_DivOp $l, $r, $_)), (TF_FloorDivOp $l, $r), +def : Pat<(HLO_FloorOp (HLO_DivOp $l, $r)), (TF_FloorDivOp $l, $r)>; +def : Pat<(HLO_FloorOp (HLOClient_BroadcastDivOp $l, $r, $_)), (TF_FloorDivOp $l, $r), [(AreBroadcastCompatible $l, $r)]>; def : Pat<(HLO_ComplexOp $r, $i), (TF_ComplexOp $r, $i)>; @@ -99,6 +115,8 @@ def : Pat<(HLO_BroadcastOp $arg, $shape), def : Pat<(HLO_TransposeOp $arg, $permutation), (TF_TransposeOp $arg, (TF_ConstOp $permutation))>; def : Pat<(HLO_ReverseOp $op, $dims), (TF_ReverseV2Op $op, (TF_ConstOp $dims))>; +def : Pat<(HLO_ReshapeOp:$output $input), + (TF_ReshapeOp $input, (ShapeToConst $output))>; //===----------------------------------------------------------------------===// // Ternary op patterns. @@ -117,16 +135,29 @@ def : Pat<(HLO_ConcatenateOp $inputs, $dim), //===----------------------------------------------------------------------===// // Compare op patterns. +// Note that these are legalized from chlo.broadcast_* ops, since those are +// semantically compatible with the corresponding TF ops. Depending on +// context, getting to these ops may require some raising. //===----------------------------------------------------------------------===// foreach p = [[TF_EqualOp, HLO_COMPARISON_DIRECTION_EQ], - [TF_NotEqualOp, HLO_COMPARISON_DIRECTION_NE]] in - def : Pat<(HLO_CompareOp $l, $r, $_, p[1]), (p[0] $l, $r, ConstBoolAttrTrue), + [TF_NotEqualOp, HLO_COMPARISON_DIRECTION_NE]] in { + def : Pat<(HLOClient_BroadcastCompareOp $l, $r, $_, p[1]), (p[0] $l, $r, ConstBoolAttrTrue), [(AreBroadcastCompatible $l, $r)]>; + def : Pat<(HLO_CompareOp $l, $r, p[1]), (p[0] $l, $r, ConstBoolAttrTrue)>; +} foreach pair = [[TF_GreaterEqualOp, HLO_COMPARISON_DIRECTION_GE], [TF_GreaterOp, HLO_COMPARISON_DIRECTION_GT], [TF_LessEqualOp, HLO_COMPARISON_DIRECTION_LE], - [TF_LessOp, HLO_COMPARISON_DIRECTION_LT]] in - def : Pat<(HLO_CompareOp $l, $r, $_, pair[1]), (pair[0] $l, $r), + [TF_LessOp, HLO_COMPARISON_DIRECTION_LT]] in { + def : Pat<(HLOClient_BroadcastCompareOp $l, $r, $_, pair[1]), (pair[0] $l, $r), [(AreBroadcastCompatible $l, $r)]>; + def : Pat<(HLO_CompareOp $l, $r, pair[1]), (pair[0] $l, $r)>; +} + +def ConvertDotOp : NativeCodeCall<"ConvertDotOp($_builder, " + "$0.getDefiningOp())">; +def : Pat<(HLO_DotOp:$old_value AnyStaticShapeTensor:$lhs, + AnyStaticShapeTensor:$rhs, $precision_config), + (ConvertDotOp $old_value)>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 550100c8ebf..cd8f988fd5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -278,6 +278,10 @@ void EraseUnusedBoundInputs(ModuleOp module) { void OptimizeGlobalTensorsPass::runOnOperation() { auto module = getOperation(); + if (!tf_saved_model::HasTfSavedModelSemantics(module)) { + return; + } + EraseUnusedBoundInputs(module); ResourceAnalyzer resource_analyzer(module); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index c1d99c2dee3..5c140ddd6aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -91,6 +91,15 @@ std::unique_ptr> 
CreateResourceDeviceInferencePass(); // of their aliasing output arguments. std::unique_ptr> CreatePromoteResourcesToArgsPass(); +// Creates a pass that promotes tf.VarHandleOp to resource arguments for all +// functions. +std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); + +// Creates a pass that converts readonly reference variables to the +// corresponding resource variables. +std::unique_ptr> +CreateConvertReadonlyReferenceVariablesToResourceVariablesPass(); + // Marks function visibility using tf.entry_function specification. That is, // functions with tf.entry_function attributes are marked with public // visibility while the other functions are marked with private visibility. @@ -258,7 +267,7 @@ std::unique_ptr> CreateTPUVariableReformattingPass(); // Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) // at head/tail of TPU cluster to run before/after TPU computation. -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass(); // Creates a pass that extract outside compilation (CPU ops inside TPU cluster) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index fa4fe461317..cece23b4750 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -47,11 +47,14 @@ limitations under the License. // . Dead functions have already been removed, as resource arguments in dead // functions can cause the pass to fail. +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -73,114 +76,189 @@ constexpr char kResourceFunctionMsg[] = "expects function level resource argument"; constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; +constexpr char kResourceNameArgAttr[] = "tf.resource_name"; -// Records the input argument index and the current live value for a resource -// variable. -// -// . If the input argument already exists or has been added, input_index is the -// index of the function, and live_value_or_type tracks the live value of the -// resource. -// -// . If the input argument has not been added in the pass, input_index is -// kInputUnassigned, live_value_or_type represents the type of the resource. -// (a) If this resource is read, add a new argument whose type is obtained -// from live_value_or_type, and input_index and live_value_or_type will be -// updated to reference the new argument. -// (b) If this resource is written, live_value_or_type will track the new -// value of the resource. input_index will remain to be kInputUnassigned. +// Checks if a function has only one block. +mlir::LogicalResult CheckSingleBlockFunction(FuncOp function) { + if (!hasSingleElement(function.getBlocks())) + return function.emitError() + << "expects function '" << function.getName() + << "' to have 1 block, got " << function.getBlocks().size(); + + return success(); +} + +// Collects names of users of a resource that are not `tf.ReadVariableOp` and +// not `tf.AssignVariableOp`. 
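+// The collected names are used to build the error messages emitted by the
+// validation helpers below.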
+llvm::SmallSet GetCompositeResourceUserNames( + Value resource) { + // SmallSet will use a vector when there is only one element and use std::set + // when there are more than one elements. This ensures that the operations in + // the error message are ordered. + llvm::SmallSet composite_users; + for (Operation* user : resource.getUsers()) + if (!llvm::isa(user) && + !llvm::isa(user)) + composite_users.insert(user->getName().getStringRef()); + + return composite_users; +} + +// Checks if `tf.VarHandleOp` has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateVarHandle(TF::VarHandleOp var_handle_op) { + auto resource_type = + getElementTypeOrSelf(var_handle_op.getType()).cast(); + if (resource_type.getSubtypes().size() != 1) + return var_handle_op.emitOpError() + << "expects resource type to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(var_handle_op); + if (!composite_ops.empty()) + return var_handle_op.emitOpError() + << "expects users to be 'tf.ReadVariableOp' or " + "'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Checks if resource argument has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateResourceArgument(FuncOp function, + BlockArgument resource_arg, + TF::ResourceType resource_type) { + if (resource_type.getSubtypes().size() != 1) + return function.emitError() + << "expects resource type of argument " + << resource_arg.getArgNumber() << " to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(resource_arg); + if (!composite_ops.empty()) + return function.emitError() + << "expects users of resource argument " + << resource_arg.getArgNumber() + << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Adds resource arguments for every unique (name) variable handle. Associated +// `tf.VarHandleOp` are removed from the function. Variable shared names are +// returned in `var_handle_shared_names` based on the ordering of added resource +// arguments. 
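+// For example (schematically), two "tf.VarHandleOp"s sharing shared_name "v"
+// plus one with shared_name "w" result in two new resource arguments, one per
+// unique name, with all handle uses rewired to those arguments.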
+mlir::LogicalResult PromoteVarHandlesToArguments( + FuncOp function, bool add_validation, + llvm::SmallVectorImpl* var_handle_shared_names) { + Block& block = function.front(); + auto func_type = function.getType(); + + auto func_arg_types = llvm::to_vector<4>(func_type.getInputs()); + llvm::SmallDenseMap var_arg_index_by_name; + for (auto var_handle_op : + llvm::make_early_inc_range(block.getOps())) { + if (add_validation && failed(ValidateVarHandle(var_handle_op))) + return failure(); + + llvm::StringRef name = var_handle_op.shared_nameAttr().getValue(); + auto it = var_arg_index_by_name.insert({name, func_arg_types.size()}); + if (it.second) { + var_handle_shared_names->emplace_back(name); + auto resource_type = var_handle_op.resource().getType(); + func_arg_types.push_back(resource_type); + var_handle_op.resource().replaceAllUsesWith( + block.addArgument(resource_type)); + } else { + var_handle_op.resource().replaceAllUsesWith( + block.getArgument(it.first->getSecond())); + } + var_handle_op.erase(); + } + + if (!var_handle_shared_names->empty()) + function.setType(FunctionType::get(func_arg_types, func_type.getResults(), + function.getContext())); + + return success(); +} + +// Records the current live value for a resource variable and whether a read or +// write on the variable occurred. struct ResourceInfo { - static constexpr int64_t kInputUnassigned = -1; - int64_t input_index; - llvm::PointerUnion live_value_or_type; + Value live_value = nullptr; + bool read = false; + bool write = false; }; -using ArgOrName = llvm::PointerUnion; -using ResourceMap = llvm::SmallDenseMap; - -LogicalResult PromoteResourcesToArguments(FuncOp function) { +LogicalResult PromoteResourcesToArguments( + FuncOp function, llvm::ArrayRef var_handle_shared_names) { Block& block = function.front(); auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); if (!return_op) - return function.emitError( - "expects 'main' function to have a MLIR ReturnOp"); + return function.emitError() << "expects function '" << function.getName() + << "' to have a MLIR ReturnOp"; - ResourceMap resource_map; + llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); + bool has_resources = false; + auto add_resource_argument = [&](BlockArgument arg, + TF::ResourceType resource_type) { + Type arg_type = resource_type.getSubtypes().front(); + arg.setType(arg_type); + resources[arg.getArgNumber()].live_value = arg; + argument_types[arg.getArgNumber()] = arg_type; + has_resources = true; + }; - // Loop through the resource arguments in the function and store a mapping - // from that argument to its index and itself as the current live value. - for (BlockArgument& func_arg : function.getArguments()) { + // Loop through the non `tf.VarHandleOp` resource arguments in the function, + // validate its uses and subtype, and store a mapping from that argument to + // itself as the current live value. 
+ auto func_args = function.getArguments().take_front( + function.getNumArguments() - var_handle_shared_names.size()); + for (BlockArgument& func_arg : func_args) { auto resource_type = getElementTypeOrSelf(func_arg.getType()).dyn_cast(); if (!resource_type) continue; - if (resource_type.getSubtypes().size() != 1) - return function.emitError() - << "expects resource type of argument " << func_arg.getArgNumber() - << " to have one subtype, got " << resource_type; + if (failed(ValidateResourceArgument(function, func_arg, resource_type))) + return failure(); - for (auto* user : func_arg.getUsers()) - if (!llvm::isa(user) && - !llvm::isa(user)) - return function.emitError() - << "expects users of resource argument " - << func_arg.getArgNumber() - << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'"; - - Type arg_type = resource_type.getSubtypes().front(); - func_arg.setType(arg_type); - resource_map[func_arg] = {func_arg.getArgNumber(), func_arg}; - argument_types[func_arg.getArgNumber()] = arg_type; + add_resource_argument(func_arg, resource_type); } - // Loop through the VarHandleOp in the function. When the first VarHandleOp - // for a resource variable is encountered, add an entry to the resource_map to - // record the information. Do not add a new function argument yet. - for (auto var_handle_op : block.getOps()) { - if (resource_map.count(var_handle_op.shared_nameAttr())) continue; - + // Loop through `tf.VarHandleOp` resource arguments in the function and store + // a mapping from that argument to itself as the current live value. No + // validations are necessary here as these arguments were validated prior to + // being added. + auto var_handle_args = + function.getArguments().take_back(var_handle_shared_names.size()); + for (BlockArgument& var_handle_arg : var_handle_args) { auto resource_type = - getElementTypeOrSelf(var_handle_op.getType()).cast(); - if (resource_type.getSubtypes().size() != 1) - return var_handle_op.emitOpError() - << "expects resource type to have one subtype, got " - << resource_type; - - resource_map[var_handle_op.shared_nameAttr()] = { - ResourceInfo::kInputUnassigned, resource_type.getSubtypes().front()}; + getElementTypeOrSelf(var_handle_arg.getType()).cast(); + add_resource_argument(var_handle_arg, resource_type); } - if (resource_map.empty()) return success(); + if (!has_resources) return success(); // We initially assign the argument for a resource as the live value for the // resource. We then walk through the operations in the function in their // lexical order, to update the live value for the resource when we see a // store to the resource and replace reads of the resource with uses of its - // live value. For the reads, if the resource does not have a live value yet, - // we add a new argument and use it as the live value. + // live value. for (Operation& op : llvm::make_early_inc_range(block)) { if (auto read_op = llvm::dyn_cast(&op)) { if (auto func_arg = read_op.resource().dyn_cast()) { if (func_arg.getOwner() != &block) return read_op.emitOpError(kResourceFunctionMsg); - // resource_map[func_arg] is always a Value when func_arg is a - // BlockArgument. 
- read_op.value().replaceAllUsesWith( - resource_map[func_arg].live_value_or_type.get()); - } else if (auto var_handle_op = llvm::dyn_cast( - read_op.resource().getDefiningOp())) { - ResourceInfo& info = resource_map[var_handle_op.shared_nameAttr()]; - if (auto live_value = info.live_value_or_type.dyn_cast()) { - read_op.value().replaceAllUsesWith(live_value); - } else { - auto arg_type = info.live_value_or_type.get(); - BlockArgument arg = block.addArgument(arg_type); - info.input_index = argument_types.size(); - info.live_value_or_type = arg; - argument_types.push_back(arg_type); - read_op.value().replaceAllUsesWith(arg); - } + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.read = true; + read_op.value().replaceAllUsesWith(resource_info.live_value); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -191,11 +269,9 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { if (func_arg.getOwner() != &block) return write_op.emitOpError(kResourceFunctionMsg); - resource_map[func_arg].live_value_or_type = write_op.value(); - } else if (auto var_handle_op = llvm::dyn_cast( - write_op.resource().getDefiningOp())) { - resource_map[var_handle_op.shared_nameAttr()].live_value_or_type = - write_op.value(); + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.write = true; + resource_info.live_value = write_op.value(); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -206,67 +282,68 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { const int64_t num_results_before = function.getNumResults(); auto return_operands = llvm::to_vector<4>(return_op.getOperands()); - return_operands.reserve(num_results_before + resource_map.size()); auto result_types = llvm::to_vector<4>(return_op.getOperandTypes()); - result_types.reserve(num_results_before + resource_map.size()); - llvm::SmallVector, 4> output_only_resources; - output_only_resources.reserve(resource_map.size()); + llvm::SmallVector, 4> + output_only_resources; llvm::SmallVector, 4> input_output_alias; - input_output_alias.reserve(resource_map.size()); - // Collect new return values and either (a) output-only resource attributes - // (if the resource is not promoted to an argument) or (b) mapping from - // resource input index to output alias (if the resource has been promoted to - // an argument). If the last live value is itself (argument), then that live - // value will not be returned as the resource is unmodified. - for (auto& resource : resource_map) { - int64_t input_index = resource.getSecond().input_index; - auto live_value = resource.getSecond().live_value_or_type.dyn_cast(); - if (input_index == ResourceInfo::kInputUnassigned) { - if (!live_value) continue; - - output_only_resources.push_back( - {return_operands.size(), resource.getFirst().dyn_cast()}); - } else { - // live_value is not nullptr because any input-assigned resource has a - // Value as live_value. - auto live_arg = live_value.dyn_cast(); - if (live_arg && live_arg.getOwner() == &block && - live_arg.getArgNumber() == input_index) - continue; - - input_output_alias.push_back({input_index, return_operands.size()}); - } - return_operands.push_back(live_value); - result_types.push_back(live_value.getType()); - } - - // Erase all VarHandleOp. 
- for (Operation& op : llvm::make_early_inc_range(function.front())) { - auto var_handle_op = llvm::dyn_cast(op); - if (!var_handle_op) continue; - if (!var_handle_op.use_empty()) { - // SmallSet will use a vector when there is only one element and use - // std::set when there are more than one elements. This ensures that - // the operations in the error message are ordered. - llvm::SmallSet unique_operations; - llvm::for_each( - var_handle_op.getOperation()->getUsers(), [&](Operation* user) { - unique_operations.insert(user->getName().getStringRef().str()); - }); - - return var_handle_op.emitOpError( - "expects no uses but used by operations: ") - << llvm::join(unique_operations.begin(), unique_operations.end(), - ", "); - } - - op.erase(); - } - - // Rewrite return if more results need to be returned by the function. + // Collect new return values for variable writes and either (a) output-only + // resource attributes (if the resource is not promoted to an argument) or (b) + // mapping from resource input index to output alias (if the resource has been + // promoted to an argument). Resource arguments that were originally + // `tf.VarHandleOp` but not read are collected and then removed. OpBuilder builder(return_op); - if (!output_only_resources.empty() || !input_output_alias.empty()) { + const int var_handles_start_idx = + function.getNumArguments() - var_handle_shared_names.size(); + int new_argument_index = 0; + llvm::SmallVector argument_indices_to_remove; + for (auto resource_and_index : llvm::enumerate(resources)) { + const auto& resource = resource_and_index.value(); + if (!resource.live_value) { + // Ignore non resource arguments. + ++new_argument_index; + continue; + } + + const auto index = resource_and_index.index(); + const bool is_var_handle = index >= var_handles_start_idx; + if (resource.write) { + if (!is_var_handle || resource.read) { + input_output_alias.push_back( + {new_argument_index, return_operands.size()}); + } else if (is_var_handle) { + output_only_resources.push_back( + {return_operands.size(), + var_handle_shared_names[index - var_handles_start_idx]}); + } + return_operands.push_back(resource.live_value); + result_types.push_back(resource.live_value.getType()); + } + + if (is_var_handle && !resource.read) { + assert(block.getArgument(index).getUses().empty()); + argument_indices_to_remove.push_back(index); + } else { + if (is_var_handle) { + // Add resource_name attribute to VarHandleOp read. + function.setArgAttr( + new_argument_index, kResourceNameArgAttr, + builder.getStringAttr( + var_handle_shared_names[index - var_handles_start_idx])); + } + ++new_argument_index; + } + } + + // Remove unread var handle arguments. + for (int argument_index_to_remove : + llvm::reverse(argument_indices_to_remove)) { + block.eraseArgument(argument_index_to_remove); + argument_types.erase(argument_types.begin() + argument_index_to_remove); + } + + // Rewrite return if there are variable writes. + if (return_operands.size() > num_results_before) { builder.create(return_op.getLoc(), return_operands); return_op.erase(); } @@ -274,17 +351,10 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { // Update function argument and result types with new resource subtypes. function.setType(builder.getFunctionType(argument_types, result_types)); - // Add resource_name attribute to the input argument for the resources. 
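+  // The net effect for promoted `tf.VarHandleOp`s, as a rough sketch
+  // (attribute spelling per kResourceNameArgAttr; variable name "v" is
+  // illustrative):
+  //
+  //   read & written : argument kept with {tf.resource_name = "v"}; the final
+  //                    value is returned and recorded as aliasing the argument
+  //   read only      : argument kept with {tf.resource_name = "v"}; nothing
+  //                    extra is returned
+  //   written only   : argument erased; the final value is returned as an
+  //                    output-only result tagged with the shared name
+  //   never used     : argument erased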
- for (auto& resource : resource_map) { - if (auto attr = resource.getFirst().dyn_cast()) { - int64_t input_index = resource.getSecond().input_index; - if (input_index != ResourceInfo::kInputUnassigned) - function.setArgAttr(input_index, "tf.resource_name", attr); - } - } // Add resource_name attribute to the output for the resources. for (auto& resource : output_only_resources) - function.setResultAttr(resource.first, "tf.resource_name", resource.second); + function.setResultAttr(resource.first, kResourceNameArgAttr, + builder.getStringAttr(resource.second)); // Add aliasing_output attribute to the input argument for the resources that // are updated by the function. @@ -309,26 +379,60 @@ void PromoteResourcesToArgsPass::runOnOperation() { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. - if (!hasSingleElement(main_func.getBlocks())) { - main_func.emitError() << "expects 'main' function to have 1 block, got " - << main_func.getBlocks().size(); - return signalPassFailure(); - } + if (failed(CheckSingleBlockFunction(main_func))) return signalPassFailure(); + llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || - failed(PromoteResourcesToArguments(main_func))) + failed(PromoteVarHandlesToArguments(main_func, /*add_validation=*/true, + &var_handle_shared_names)) || + failed(PromoteResourcesToArguments(main_func, var_handle_shared_names))) return signalPassFailure(); } +class PromoteVarHandlesToArgsPass + : public PassWrapper> { + public: + void runOnOperation() override; +}; + +void PromoteVarHandlesToArgsPass::runOnOperation() { + ModuleOp module = getOperation(); + MLIRContext* context = module.getContext(); + for (auto function : module.getOps()) { + if (failed(CheckSingleBlockFunction(function))) return signalPassFailure(); + + llvm::SmallVector var_handle_shared_names; + PromoteVarHandlesToArguments(function, /*add_validation=*/false, + &var_handle_shared_names); + + // Add resource names for each `tf.VarHandleOp` that were promoted to + // resource arguments. + const int var_handle_args_offset = + function.getNumArguments() - var_handle_shared_names.size(); + for (auto var_name_and_index : llvm::enumerate(var_handle_shared_names)) + function.setArgAttr(var_name_and_index.index() + var_handle_args_offset, + kResourceNameArgAttr, + StringAttr::get(var_name_and_index.value(), context)); + } +} + } // namespace std::unique_ptr> CreatePromoteResourcesToArgsPass() { return std::make_unique(); } +std::unique_ptr> CreatePromoteVarHandlesToArgsPass() { + return std::make_unique(); +} + static PassRegistration pass( "tf-promote-resources-to-args", "Promote resources reads/writes to function inputs/outputs."); +static PassRegistration var_handle_pass( + "tf-promote-var-handles-to-args", + "Promote tf.VarHandleOps to function arguments."); + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc new file mode 100644 index 00000000000..a80b84ddeda --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc @@ -0,0 +1,179 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { +namespace { + +// Location attribute. +constexpr StringRef kClassAttr = "_class"; +constexpr StringRef kLocationPrefix = "loc:@"; + +// A pass that converts readonly reference variables to the corresponding +// resource variables. +// +// It converts (VariableV2 -> Identity) to (VarHandle -> ReadVariable). +// +// For the background, this pass is a part of hoisting VariableV2 ops by +// re-using the pipeline for hoisting (VarHandle -> ReadVariable) cases, which +// can be done by the following passes: +// - Capturing resource values into global tensors (importing saved model). +// - Promoting VarHandle ops to function input/outputs. +// - Freezing global tensor pass. +// +// This path assumes that all the VariableV2 ops is read-only via verifying the +// heuristic method that assumes that all the users of them is Identity op, +// fed directly. +class ConvertReadonlyReferenceVariablesToResourceVariablesPass + : public PassWrapper< + ConvertReadonlyReferenceVariablesToResourceVariablesPass, + FunctionPass> { + public: + void runOnFunction() override; +}; + +// Parse node name from "_class" attribute. +StringRef GetNodeNameFromClassAttr(Operation *op) { + ArrayAttr classes_attr = op->getAttrOfType(kClassAttr); + if (!classes_attr) { + op->emitOpError() << "has no '_class' attribute"; + return StringRef(); + } + + StringRef result; + for (Attribute class_attr : classes_attr) { + StringRef node_name = class_attr.cast().getValue(); + if (!node_name.startswith(kLocationPrefix)) { + continue; + } + if (!result.empty()) { + // Invalid case since there are multiple loc:@ attributes. 
+ op->emitOpError() + << "expects only one named location in '_class' attribute, but got " + << classes_attr; + return StringRef(); + } + result = node_name.drop_front(kLocationPrefix.size()); + } + if (result.empty()) { + op->emitOpError() << "expects variable name in '_class' attribute, but got " + << classes_attr; + } + return result; +} + +void ConvertReadonlyReferenceVariablesToResourceVariablesPass::runOnFunction() { + FuncOp func = getFunction(); + + OpBuilder builder(func.getContext()); + SmallVector variable_v2s_to_replace; + + // Checks all the VariableV2 ops is read-only via verifying the heuristic + // method that assumes that all the users of them is Identity op, feeded + // directly. + auto read_only_vars_fn = [&variable_v2s_to_replace]( + VariableV2Op variable_v2_op) { + if (variable_v2_op.getResult().use_empty()) { + // Erase the op when there is no user. + variable_v2_op.erase(); + return mlir::WalkResult::advance(); + } + if (!all_of(variable_v2_op.getResult().getUsers(), [&variable_v2_op]( + Operation *user) { + if (!isa(user)) { + variable_v2_op.emitOpError() + << "expects all users to be 'tf.Identity', but got user " + << user->getName(); + return false; + } + return true; + })) { + return mlir::WalkResult::interrupt(); + } + variable_v2s_to_replace.push_back(variable_v2_op); + return mlir::WalkResult::advance(); + }; + + WalkResult walk_res = func.walk(read_only_vars_fn); + if (walk_res.wasInterrupted()) return signalPassFailure(); + + for (VariableV2Op variable_v2_op : variable_v2s_to_replace) { + builder.setInsertionPoint(variable_v2_op); + ShapedType shaped_type = + variable_v2_op.getResult().getType().cast(); + TensorType tensor_type = DropRefType(shaped_type).cast(); + StringAttr device_attr = variable_v2_op.getAttrOfType("device"); + if (!device_attr) device_attr = builder.getStringAttr(""); + StringRef variable_name = GetNodeNameFromClassAttr(variable_v2_op); + if (variable_name.empty()) { + return signalPassFailure(); + } + VarHandleOp var_handle_op = builder.create( + variable_v2_op.getLoc(), + ArrayRef{RankedTensorType::get( + {}, TF::ResourceType::get(ArrayRef{tensor_type}, + builder.getContext()))}, + ArrayRef{}, + ArrayRef{ + builder.getNamedAttr("device", device_attr), + builder.getNamedAttr("container", variable_v2_op.containerAttr()), + builder.getNamedAttr("shared_name", + builder.getStringAttr(variable_name))}); + for (Operation *user : + make_early_inc_range(variable_v2_op.getResult().getUsers())) { + builder.setInsertionPoint(user); + ReadVariableOp read_variable_op = builder.create( + user->getLoc(), ArrayRef{tensor_type}, + ArrayRef{var_handle_op}, ArrayRef{}); + user->getResult(0).replaceAllUsesWith(read_variable_op.getResult()); + user->erase(); + } + variable_v2_op.erase(); + } +} + +} // namespace + +std::unique_ptr> +CreateConvertReadonlyReferenceVariablesToResourceVariablesPass() { + return std::make_unique< + ConvertReadonlyReferenceVariablesToResourceVariablesPass>(); +} + +static PassRegistration< + ConvertReadonlyReferenceVariablesToResourceVariablesPass> + pass("readonly-references-to-resources", + "Convert readonly reference variables to resource variables."); + +} // namespace TF + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index fe9283d6932..15eb5593651 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ 
b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -37,18 +37,37 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/logging.h" namespace mlir { namespace TFDevice { namespace { constexpr char kDeviceAttr[] = "device"; +constexpr char kReplicaIdAttr[] = "_xla_replica_id"; struct ReplicateToIslandPass : public PassWrapper { void runOnFunction() override; }; +// Returns whether op requires `_xla_replica_id` attribute. +bool RequiresReplicaIDAttribute(Operation* op) { + return llvm::isa(op) || + llvm::isa(op); +} + +// Adds integer attribute that represents replica id for replicated ops that +// require replica id attribute. +void AddReplicaIdToOpsInReplicatedRegion(OpBuilder* builder, Region* region, + const int replica_id) { + region->walk([&](Operation* replicated_op) { + if (RequiresReplicaIDAttribute(replicated_op)) + replicated_op->setAttr(kReplicaIdAttr, + builder->getI32IntegerAttr(replica_id)); + }); +} + // Creates islands per replica from `tf_device.replicate` region. If for a // `tf_device.launch` op the device is an aliased device of the // `tf_device.replicate`, the device will be remapped to an explicit device @@ -90,6 +109,14 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // Copy over replicate region into replica island. replicate_op.body().cloneInto(&replica.body(), mapping); + // TODO(b/157624749): Replace this with better abstraction to + // differentiate ops for different replicas. + // Some ops, such as XlaHostCompute op or TPU Embedding ops, require + // replica id to be added as an op attribute to be used during + // execution. Handle such ops separately and add an integer attribute + // that represents replica id. + AddReplicaIdToOpsInReplicatedRegion(builder, &replica.body(), i); + // Map aliased devices to explicit devices based on replica. if (has_devices) { replica.walk([&](tf_device::LaunchOp launch) { @@ -156,9 +183,9 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +void CreateIslandsFromReplicate(const Dialect* tf_dialect, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); const int num_replicas = replicate_op.n().getLimitedValue(); @@ -199,21 +226,17 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, } island_op.erase(); - return success(); } // Finds islands with a single `tf_device.replicate` and create individual // islands per replica of the replicate. 
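+// For illustration, ops that must know their replica (e.g. XlaHostCompute or
+// the TPU embedding ops mentioned above) end up tagged per replica roughly as
+//
+//   "tf.SomeReplicaAwareOp"(...) {_xla_replica_id = 1 : i32}
+//
+// where the op name here is hypothetical; only `_xla_replica_id` and its i32
+// value come from AddReplicaIdToOpsInReplicatedRegion above.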
-LogicalResult LowerSingleIslandReplicateToIslands( - const Dialect* tf_dialect, tf_executor::IslandOp island_op) { - if (!hasSingleElement(island_op.GetBody().without_terminator())) - return success(); +void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, + tf_executor::IslandOp island_op) { + if (!island_op.WrapsSingleOp()) return; if (auto replicate_op = llvm::dyn_cast(&island_op.GetBody().front())) - return CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); - - return success(); + CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); } void ReplicateToIslandPass::runOnFunction() { @@ -223,13 +246,9 @@ void ReplicateToIslandPass::runOnFunction() { getFunction().emitError() << "'tf' dialect is not registered"; } - auto result = getFunction().walk([&](tf_executor::IslandOp island_op) { - if (failed(LowerSingleIslandReplicateToIslands(tf_dialect, island_op))) - return WalkResult::interrupt(); - return WalkResult::advance(); + getFunction().walk([&](tf_executor::IslandOp island_op) { + LowerSingleIslandReplicateToIslands(tf_dialect, island_op); }); - - if (result.wasInterrupted()) return signalPassFailure(); } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index d37dfd14590..21d74d81b20 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -149,7 +149,7 @@ LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, } auto walk_res = func_op.walk([&](Operation* op) { if (auto var_handle = llvm::dyn_cast(op)) { - // Record VarHanldeOp's device attribute. + // Record VarHandleOp's device attribute. auto device_attr = var_handle.getAttrOfType(kDeviceAttr); if (!device_attr || device_attr.getValue().empty()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 611c4d2725a..ed7ebc25c9f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -571,7 +571,7 @@ void AddLoadsStoresOutsideControlFlowOp( } // Lifts loads/stores from while loop's body and cond functions. -LogicalResult HanldeWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { +LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { // Remove identity nodes to avoid aliasing. RemoveIdentity(&body.front()); RemoveIdentity(&cond.front()); @@ -668,72 +668,74 @@ LogicalResult HanldeWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { return success(); } -// Lifts loads/stores from an IfOp's branches. -LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, - FuncOp else_branch) { +// Lifts loads/stores from an IfOp or CaseOp's branches. +template +LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { // Remove identity nodes to avoid aliasing. - RemoveIdentity(&then_branch.front()); - RemoveIdentity(&else_branch.front()); + for (auto func : branches) RemoveIdentity(&func.front()); + // Sanity check: branch return of resources should be aliases of inputs. If // so, replace the output uses with the input so that we can remove these // outputs. 
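+  // For example (a simplified sketch): if every branch returns its own
+  // resource argument %r at the same result position,
+  //
+  //   func @then_branch(..., %r: tensor<!tf.resource<tensor<f32>>>) {
+  //     ...
+  //     return ..., %r
+  //   }
+  //
+  // that resource result of the tf.If/tf.Case is replaced with the matching
+  // operand of the op (offset by one to skip the predicate/branch index) and
+  // can then be dropped from the branch signatures.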
- for (auto entry : llvm::enumerate( - llvm::zip(then_branch.front().getTerminator()->getOperands(), - else_branch.front().getTerminator()->getOperands()))) { - auto then_retval = std::get<0>(entry.value()); - auto else_retval = std::get<1>(entry.value()); - assert(then_retval.getType() == else_retval.getType()); - if (!getElementTypeOrSelf(then_retval.getType()).isa()) { + for (OpResult result : op.getResults()) { + if (!getElementTypeOrSelf(result.getType()).isa()) continue; + unsigned result_index = result.getResultNumber(); + constexpr unsigned kUnassigned = -1; + unsigned common_aliasing_arg_num = kUnassigned; + for (auto func : branches) { + auto retval = func.front().getTerminator()->getOperand(result_index); + assert(result.getType() == retval.getType()); + auto aliasing_arg = retval.dyn_cast(); + if (common_aliasing_arg_num == kUnassigned) + common_aliasing_arg_num = aliasing_arg.getArgNumber(); + if (!aliasing_arg || + aliasing_arg.getArgNumber() != common_aliasing_arg_num) + return op.emitOpError("unsupported output: ") + << "resource does not alias a single input"; } - auto then_aliasing_arg = then_retval.dyn_cast(); - auto else_aliasing_arg = else_retval.dyn_cast(); - if (!then_aliasing_arg || !else_aliasing_arg || - then_aliasing_arg.getArgNumber() != else_aliasing_arg.getArgNumber()) { - return if_op.emitOpError("unsupported tf.IfOp output: ") - << "resource does not alias a single input."; - } - if_op.getResult(entry.index()) - .replaceAllUsesWith( - if_op.getOperand(then_aliasing_arg.getArgNumber() + 1)); + assert(common_aliasing_arg_num != kUnassigned); + result.replaceAllUsesWith(op.getOperand(common_aliasing_arg_num + 1)); } + // Erase the resource outputs from the branches. int64_t non_resource_results = 0; llvm::SmallVector old_to_new_output_indices; - llvm::SmallVector new_output_shapes; bool output_removed = false; - for (auto result : if_op.getResults()) { - if (!getElementTypeOrSelf(result.getType()).isa()) { + for (auto result : op.getResults()) { + if (!getElementTypeOrSelf(result.getType()) + .template isa()) { old_to_new_output_indices.push_back(non_resource_results++); - if (!if_op.output_shapes().getValue().empty()) { - new_output_shapes.push_back( - if_op.output_shapes().getValue()[result.getResultNumber()]); - } continue; } old_to_new_output_indices.push_back(-1); - then_branch.front().getTerminator()->eraseOperand(non_resource_results); - else_branch.front().getTerminator()->eraseOperand(non_resource_results); + for (auto func : branches) + func.front().getTerminator()->eraseOperand(non_resource_results); output_removed = true; } - llvm::SmallDenseMap then_use_info; - llvm::SmallDenseMap else_use_info; - if (failed(FindResourceArgUseInfo(then_branch, &then_use_info)) || - failed(FindResourceArgUseInfo(else_branch, &else_use_info))) { + llvm::SmallDenseMap resource_arg_uses; + if (failed(FindResourceArgUseInfo(branches.front(), &resource_arg_uses))) return failure(); + + for (auto func : branches.drop_front()) { + llvm::SmallDenseMap branch_use_info; + if (failed(FindResourceArgUseInfo(func, &branch_use_info))) + return failure(); + // A resource is considered used as long as it is used in either branch. + resource_arg_uses = + MergeArgResourceUseInfo(resource_arg_uses, branch_use_info); } - // A resource is considered used as long as it is used in either branch. - auto resource_arg_uses = - MergeArgResourceUseInfo(then_use_info, else_use_info); + if (resource_arg_uses.empty() && !output_removed) return success(); // Remove unused resources in functions. 
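+  // For example: if only one branch reads a given resource argument, the
+  // merged info above still marks it as read, so the argument survives in
+  // every branch (keeping the signatures identical); an argument no branch
+  // reads or writes is removed from all of them below.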
llvm::SmallDenseMap remaining_resource_data_types; RemoveUnusedResourceArgumentsAndForwardedRetvals( - resource_arg_uses, then_branch, /*old_to_new_arg_indices=*/nullptr, + resource_arg_uses, branches.front(), /*old_to_new_arg_indices=*/nullptr, &remaining_resource_data_types); - RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, - else_branch); + for (auto func : branches.drop_front()) + RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, func); + // Forward resource inputs updated in any branch to the outputs of both // branches. First prepare the mapping from arg to new update output. llvm::SmallDenseMap resource_arg_to_new_output; @@ -751,10 +753,11 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, new_output_index; } } + // Append resource updates to the return ops: now they are just forwarded // input resources, but will be replaced by the data value in // LiftArgRetResourcesForFunction(). - for (auto branch : {then_branch, else_branch}) { + for (auto branch : branches) { auto new_retvals = llvm::to_vector<4>(branch.front().getTerminator()->getOperands()); for (const auto& entry : resource_arg_to_new_output) { @@ -771,18 +774,18 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, }); } - // Recreate the if op. - OpBuilder builder(if_op); + // Recreate the op without resource operands. + OpBuilder builder(op); // Now use the filtered original operands, which will be replaced by // AddLoadsStoresOutsideControlFlowOp(). auto new_operands = - FilterRange(if_op.input(), resource_arg_uses); - new_operands.insert(new_operands.begin(), if_op.cond()); - auto new_if = builder.create(if_op.getLoc(), - then_branch.getType().getResults(), - new_operands, if_op.getAttrs()); - // Prepare for AddLoadsStoresOutsideControlFlowOp() and update - // new_output_shapes. + FilterRange(op.input(), resource_arg_uses); + new_operands.insert(new_operands.begin(), op.getOperand(0)); + FuncOp first_func = branches.front(); + auto new_op = + builder.create(op.getLoc(), first_func.getType().getResults(), + new_operands, op.getAttrs()); + // Prepare for AddLoadsStoresOutsideControlFlowOp() llvm::SmallDenseMap> arg_data_type_and_updated_output_index; for (const auto& entry : remaining_resource_data_types) { @@ -792,22 +795,17 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, : new_output_it->getSecond(); arg_data_type_and_updated_output_index[entry.getFirst() + 1] = { entry.getSecond(), update_index}; - if (!if_op.output_shapes().getValue().empty() && update_index >= 0) { - new_output_shapes.push_back( - tensorflow::ConvertTypeToTensorShapeAttr(entry.getSecond())); - } } - AddLoadsStoresOutsideControlFlowOp(new_if, + AddLoadsStoresOutsideControlFlowOp(new_op, arg_data_type_and_updated_output_index); - new_if.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); // Replace uses. 
for (int64_t i = 0; i < old_to_new_output_indices.size(); ++i) { if (old_to_new_output_indices[i] >= 0) { - if_op.getResult(i).replaceAllUsesWith( - new_if.getResult(old_to_new_output_indices[i])); + op.getResult(i).replaceAllUsesWith( + new_op.getResult(old_to_new_output_indices[i])); } } - if_op.erase(); + op.erase(); return success(); } @@ -985,7 +983,7 @@ LogicalResult HoistForFunctionalControlFlow( lifted_partitioned_call_callees); HoistForFunctionalControlFlow(&cond.front(), module, lifted_partitioned_call_callees); - if (failed(HanldeWhileLoop(while_op, body, cond))) return failure(); + if (failed(HandleWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { auto then_branch = llvm::cast(module.lookupSymbol(if_op.then_branch())); @@ -996,7 +994,20 @@ LogicalResult HoistForFunctionalControlFlow( lifted_partitioned_call_callees); HoistForFunctionalControlFlow(&else_branch.front(), module, lifted_partitioned_call_callees); - if (failed(HandleIfOP(if_op, then_branch, else_branch))) return failure(); + if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch}))) + return failure(); + } else if (auto case_op = llvm::dyn_cast(&op)) { + SmallVector branch_functions; + branch_functions.reserve(case_op.branches().size()); + for (const Attribute& branch : case_op.branches()) { + FuncOp func = + module.lookupSymbol(branch.cast()); + // Recursively handle the nested control flow. + HoistForFunctionalControlFlow(&func.front(), module, + lifted_partitioned_call_callees); + branch_functions.push_back(func); + } + if (failed(HandleCaseOrIfOp(case_op, branch_functions))) return failure(); } else if (auto call_op = llvm::dyn_cast(&op)) { if (!call_op.f().isa()) { return call_op.emitOpError( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 789088bd585..1e9be76aa66 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -66,8 +66,7 @@ using tensorflow::shape_inference::ShapeHandle; namespace mlir { namespace TF { namespace { -Optional> InferShapeForFunctionReturnType( - FuncOp func) { +Optional> InferShapeForFunctionReturnType(FuncOp func) { // Find any return ops. SmallVector return_ops; for (Block& block : func) { @@ -137,9 +136,9 @@ void AddCastBackForUnsupportedNonTFUses(Operation* op, Value result, cast_op = b.create(op->getLoc(), old_type, result, /*truncate=*/b.getBoolAttr(false)); } - return mlir::Value(cast_op); + return Value(cast_op); }; - for (OpOperand& use : llvm::make_early_inc_range(result.getUses())) { + for (OpOperand& use : make_early_inc_range(result.getUses())) { if (use.getOwner()->getDialect() != tf_dialect && !IsSupportedNonTFOp(use.getOwner())) use.set(get_cast_op()); @@ -162,7 +161,7 @@ Optional GetShapeFromMlirType(Type t) { bool InferShapeForPassThroughOps(OperandRange pass_through_operands, Operation* op, Dialect* tf_dialect) { bool changed = false; - for (auto entry : llvm::zip(pass_through_operands, op->getResults())) { + for (auto entry : zip(pass_through_operands, op->getResults())) { Type operand_type = std::get<0>(entry).getType(); Value result = std::get<1>(entry); if (result.getType() == operand_type) continue; @@ -204,7 +203,7 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { tf_dialect); } // TODO(b/155227679): Use OpInterface instead of hard-coding for TensorCastOp. 
- if (auto tensor_cast = dyn_cast(op)) { + if (auto tensor_cast = dyn_cast(op)) { return InferShapeForPassThroughOps( tensor_cast.getOperation()->getOperands(), op, tf_dialect); } @@ -254,7 +253,7 @@ GetSubtypes(Type type) { // match the i-th operand type). Returns true if anything is changed. bool PassThroughOperandTypes(OperandRange operands, ResultRange results) { bool changed = false; - for (auto entry : llvm::zip(operands, results)) { + for (auto entry : zip(operands, results)) { Type operand_type = std::get<0>(entry).getType(); Type result_type = std::get<1>(entry).getType(); if (operand_type == result_type) continue; @@ -291,14 +290,13 @@ bool InferShapeForCall(Operation* op) { CallInterfaceCallable callable = call_op.getCallableForCallee(); SymbolRefAttr sym = callable.dyn_cast(); if (!sym) return false; - FuncOp func = - dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); + FuncOp func = dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); if (!func) return false; bool changed = false; // Map each of the results of the call to the returned type of the // function. - for (auto result : llvm::zip(op->getResults(), func.getType().getResults())) { + for (auto result : zip(op->getResults(), func.getType().getResults())) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Skip already statically shaped results. if (!CanBeRefined(std::get<0>(result).getType())) continue; @@ -323,8 +321,8 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, Operation* op = infer_ti.getOperation(); SmallVector inferred; LogicalResult res = infer_ti.inferReturnTypes( - op->getContext(), op->getLoc(), op->getOperands(), op->getAttrs(), - op->getRegions(), inferred); + op->getContext(), op->getLoc(), op->getOperands(), + op->getAttrDictionary(), op->getRegions(), inferred); if (failed(res)) { op->emitOpError("failed to refine type as inference failed"); return false; @@ -335,7 +333,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // Map each of the results of the call to the returned type of the // function. bool changed = false; - for (auto result : llvm::zip(op->getResults(), inferred)) { + for (auto result : zip(op->getResults(), inferred)) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Inserts a cast back to the original type if any user is not in the @@ -356,7 +354,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // so for tf.Const -> tensor<10x20xf32>, [0,2,18] would point to a unique output // scalar value). 
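+// For illustration, a sketch of how the struct below is populated (see its
+// constructors):
+//
+//   ValuePort(op_result) -> producer = the defining op, port = {result #}
+//   ValuePort(block_arg) -> producer = the block argument, port = {0}
+//
+// Deeper ports are used when only part of a produced value (e.g. one element
+// of a shape tensor) needs to be evaluated.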
struct ValuePort { - llvm::PointerUnion producer; + PointerUnion producer; SmallVector port; bool operator==(const ValuePort& other) const { @@ -374,39 +372,38 @@ struct ValuePort { port = {0}; } } - ValuePort(llvm::PointerUnion producer, + ValuePort(PointerUnion producer, SmallVector port) : producer(producer), port(port) {} - llvm::raw_ostream& print(llvm::raw_ostream& os) const { + raw_ostream& print(raw_ostream& os) const { if (auto* op = producer.dyn_cast()) os << "op " << op->getName(); if (auto ba = producer.dyn_cast()) os << "block_arg " << ba.getArgNumber(); - os << llvm::formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); + os << formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); return os; } }; struct ValuePortHasher { std::size_t operator()(const ValuePort& other) const { - return llvm::hash_combine( - llvm::hash_value(other.producer.getOpaqueValue()), - llvm::hash_value(ArrayRef(other.port))); + return hash_combine(llvm::hash_value(other.producer.getOpaqueValue()), + hash_value(ArrayRef(other.port))); } }; using ValuePortResultMap = std::unordered_map; -using ComputedQueryFn = llvm::function_ref; -using ValueQueryFn = llvm::function_ref; -using ValuePortInputs = llvm::SmallVectorImpl; +using ComputedQueryFn = function_ref; +using ValueQueryFn = function_ref; +using ValuePortInputs = SmallVectorImpl; -// TODO(jpienaar): InputsRequiredForOutput and ComputeOutputComponent are +// TODO(jpienaar): ComputeInputsRequiredForOutput and ComputeOutputComponent are // intended to be switched to op interfaces once more refined. -LogicalResult InputsRequiredForOutput(ValuePort value_port, - ComputedQueryFn has_been_computed, - ValuePortInputs* inputs) { +LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ComputedQueryFn has_been_computed, + ValuePortInputs* inputs) { auto op = value_port.producer.dyn_cast(); auto& port = value_port.port; if (!op) return failure(); @@ -432,7 +429,8 @@ LogicalResult InputsRequiredForOutput(ValuePort value_port, // existing computed values. Attribute ComputeOutputComponent(const ValuePort& value_port, ValueQueryFn values) { - LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); + LLVM_DEBUG(value_port.print(llvm::dbgs() << "Computing output for ") << "\n"); + if (auto known = values(value_port)) return known; auto op = value_port.producer.dyn_cast(); if (!op) return nullptr; @@ -457,29 +455,140 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, ValuePort op_port(op->getOperand(port[1])); return values(op_port); } + + if (auto graph = dyn_cast(op)) { + if (port.size() == 1) + return ComputeOutputComponent( + ValuePort(graph.GetFetch().fetches()[port[0]]), values); + return nullptr; + } + + if (auto island = dyn_cast(op)) { + if (port.size() == 1) + return ComputeOutputComponent( + ValuePort(island.GetYield().fetches()[port[0]]), values); + return nullptr; + } + return nullptr; } -ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { +// Context used during ShapeInference. This class contains common information +// that is required by the individual shape inference helper functions (e.g., +// TF Graph version, constant values computed, etc.) 
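+// A minimal usage sketch, mirroring InferShapeForFunction further below (the
+// flag value here is illustrative):
+//
+//   ShapeInference context(graph_version, func.getContext(),
+//                          /*propagate_caller_callee_constants=*/false);
+//   if (failed(context.InferShapeUntilFixPoint(&func.getBody())))
+//     return failure();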
+class ShapeInference { + public: + ShapeInference(int64_t graph_version, MLIRContext* context, + bool propagate_caller_callee_constants); + + LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ValuePortInputs* inputs) { + return ::mlir::TF::ComputeInputsRequiredForOutput( + value_port, + [this](const ValuePort& port) { + return results_.find(port) != results_.end(); + }, + inputs); + } + + Attribute ComputeOutputComponent(const ValuePort& value_port) { + if (auto known_attr = results_[value_port]) return known_attr; + auto attr = ::mlir::TF::ComputeOutputComponent( + value_port, [this](const ValuePort& port) { return results_[port]; }); + RecordValue(value_port, attr); + return attr; + } + + // Returns ShapeHandle if the op result could be computed as shape. + ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic); + + void RecordValue(const ValuePort& value_port, Attribute value) { + LLVM_DEBUG(value_port.print(llvm::dbgs() << "\trecording ") + << value << "\n"); + results_[value_port] = value; + } + + // Performs shape inference on the provided op and return true if the type of + // at least one result has been changed. + // A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. + // `graph_version` indicates the current GraphDef compatibility versions + // (the versions field in graph.proto). + bool InferShapeForSingleOperation(Operation* op); + + // Infers shape on the provided region, including nested ones, iterate until + // fix point with a limit of max_iteration. Returns success if fix point is + // reached before max_iteration. + LogicalResult InferShapeUntilFixPoint(Region* region, + int64_t max_iteration = 10); + + // Updates input types and refine shapes inside body of functions that are + // attached to ControlFlow ops (If/While). These functions include Then/Else + // branches of IfOp and Cond/Body functions of WhileOp. These functions share + // following common properties: + // 1) They are never reused, ie. having a single use in module. + // 2) Their input types match those of their parent ops (excluding inputs + // like predicate). + // Returns a boolean indicating whether any change has been applied. + LogicalResult RefineShapeForControlFlowFunc(FuncOp func, + ArrayRef input_types, + int64_t max_iteration); + + // Propagate the shapes to the functions named. + LogicalResult PropagateShapeToFunctions( + ModuleOp module, Operation::operand_type_range input_types, + ArrayRef func_names, int64_t max_iteration); + + // Shape propagation for call/control flow ops. + LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, + int64_t max_iteration); + + // Propagates any constant operand of call_op to the called function body's + // corresponding argument if the callee has only one use. + // + // TODO(b/154065712): Move this to a more general inter-procedural constant + // folding pass. + void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Propagates any constant return value of the callee function to the call + // op's corresponding result. + void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Tries to compute the result of folding the op. This doesn't actually + // perform constant folding, it is just computes the equivalent constants. + // Returns whether it was able to compute constant values. 
+ LogicalResult TryToFold(Operation* op); + + private: + // Mapping between ValuePort (which corresponds to an OpResult or smaller, + // e.g., first element of OpResult produced) to an Attribute if the ValuePort + // corresponds to a constant value. + ValuePortResultMap results_; + int64_t graph_version_; + Dialect* tf_dialect_; + + // TODO(b/154065712): Remove propagate_caller_callee_constants once using + // SCCP pass instead. + bool propagate_caller_callee_constants_; +}; + +ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context, + bool propagate_caller_callee_constants) + : graph_version_(graph_version), + propagate_caller_callee_constants_(propagate_caller_callee_constants) { + tf_dialect_ = context->getRegisteredDialect(); +} + +ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, + InferenceContext* ic) { LLVM_DEBUG(result.print(llvm::dbgs() << "\nEvaluate partially ")); auto rt = result.getType().dyn_cast(); if (!rt || !rt.hasStaticShape() || rt.getRank() != 1) return {}; int dim_size = rt.getDimSize(0); // Worklist to direct partial evaluation. - llvm::SmallVector worklist; - // The ValuePort evaluated results. - // TODO(jpienaar): This could be cached across invocations (e.g., part of some - // inference context). - ValuePortResultMap evaluated; - // Returns whether a ValuePort has been previously computed. - auto has_been_computed = [&evaluated](const ValuePort& port) { - return evaluated.find(port) != evaluated.end(); - }; - // Returns previously computed ValuePort value. - auto values = [&evaluated](const ValuePort& port) -> Attribute { - return evaluated[port]; - }; + SmallVector worklist; // Simple evaluator that attempts to partially evaluate the input value even // if unable to evaluate the complete output. Below follows a simple stack @@ -498,7 +607,7 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { LLVM_DEBUG(front.print(llvm::errs() << "\nWorklist front ")); SmallVector inputs; - auto res = InputsRequiredForOutput(front, has_been_computed, &inputs); + auto res = ComputeInputsRequiredForOutput(front, &inputs); if (failed(res)) { // Abort if unable to find which required inputs need to be computed. worklist.clear(); @@ -513,16 +622,15 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { continue; } - auto ret = ComputeOutputComponent(front, values); + auto ret = ComputeOutputComponent(front); if (!ret) continue; - evaluated[front] = ret; LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. if (worklist.empty()) { LLVM_DEBUG(llvm::dbgs() << "[root node]\n"); - if (auto dea = ret.dyn_cast()) { + if (auto dea = ret.dyn_cast()) { if (dea.getNumElements() != 1) { LLVM_DEBUG(llvm::errs() << "Unexpected number of elements\n"); return {}; @@ -536,9 +644,10 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { return ic->MakeShape(dims); } -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version) { - assert(tf_dialect == op->getDialect()); +bool ShapeInference::InferShapeForSingleOperation(Operation* op) { + LLVM_DEBUG(op->print(llvm::dbgs() << "InferShapeForSingleOperation for "); + llvm::dbgs() << "\n"); + assert(tf_dialect_ == op->getDialect()); // The shape function of these ops sometimes does not propagate subtypes // (handle shapes) for resource and variant types. We use a simple passthrough // to make sure they are preserved in the output. 
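+  // Note on constant operands (a summary of the logic further down in this
+  // function): an operand is materialized as an input tensor for the shape
+  // function when it is either a literal constant (matched via m_Constant) or
+  // a value this pass has already computed and recorded through
+  // ComputeOutputComponent/RecordValue.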
@@ -550,7 +659,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // If no result for this op needs shape inference, we have a fast-path return. // But if the type is a resource/variant, we do not skip it because we might // not have the handle shapes. - if (llvm::none_of(op->getResultTypes(), CanBeRefined)) { + if (none_of(op->getResultTypes(), CanBeRefined)) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for statically shaped op '" << op->getName() << "'.\n"); return false; @@ -565,8 +674,8 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // This is necessary to avoid reprocessing the tf.Cast that are inserted at // the end of this function. if (isa(op) && - llvm::all_of(op->getResult(0).getUsers(), [&](Operation* user) { - return user->getDialect() != tf_dialect; + all_of(op->getResult(0).getUsers(), [&](Operation* user) { + return user->getDialect() != tf_dialect_; })) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for tf.Cast with no TF " "dialect operation users '" @@ -622,10 +731,14 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, size_t index = it.index(); // If the operand is constant, then convert it to Tensor. - ElementsAttr attr; - if (matchPattern(operand, m_Constant(&attr))) { + ValuePort vp(operand); + Attribute attr = ComputeOutputComponent(vp); + if (!attr && matchPattern(operand, m_Constant(&attr))) + RecordValue(vp, attr); + if (attr) { tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = tensorflow::ConvertToTensor(attr, input_tensor); + auto status = + tensorflow::ConvertToTensor(attr.cast(), input_tensor); if (status.ok()) { input_tensors[index] = input_tensor; } else { @@ -646,7 +759,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Perform the shape inference using an InferenceContext with the input // shapes. This object is abstracting the information that the ShapeInference // function operates on. - InferenceContext c(graph_version, *node_def, op_reg_data->op_def, + InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, input_shapes, input_tensors, /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); auto status = c.Run(op_reg_data->shape_inference_fn); @@ -659,15 +772,17 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Determine if, during shape computation, the shape functions attempted to // query an input operand as shape where the input was not known/constant. bool requires_inputs = - llvm::any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { return c.requested_input_tensor_as_partial_shape(input) && !input_tensors[input]; }); if (requires_inputs) { + LLVM_DEBUG(llvm::dbgs() << "\trequired input\n"); std::vector input_tensors_as_shapes; for (int input : llvm::seq(0, c.num_inputs())) { if (c.requested_input_tensor_as_partial_shape(input) && !input_tensors[input]) { + LLVM_DEBUG(llvm::dbgs() << "Requesting " << input << " as shape\n"); auto op_result = op->getOperand(input).dyn_cast(); if (!op_result) continue; // Resize on first valid shape computed. 
@@ -723,7 +838,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, new_element_type.isa()) { auto handle_shapes_types = c.output_handle_shapes_and_types(output); if (handle_shapes_types) { - llvm::SmallVector subtypes; + SmallVector subtypes; OpBuilder b(op); for (const auto& shape_n_type : *handle_shapes_types) { Type element_type; @@ -743,7 +858,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, if (result.getType() == new_type) continue; // Inserts a cast back to the original type if any user is not in the TF // dialect. - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect, + AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result.getType()); // Finally we inferred the shape and replace the type for this result. result.setType(new_type); @@ -755,23 +870,13 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return changed; } -// Updates input types and refine shapes inside body of functions that are -// attached to ControlFlow ops (If/While). These functions include Then/Else -// branches of IfOp and Cond/Body functions of WhileOp. These functions share -// following common properties: -// 1) They are never reused, ie. having a single use in module. -// 2) Their input types match those of their parent ops (excluding inputs like -// predicate). -// Returns a boolean indicating whether any change has been applied. -LogicalResult RefineShapeForControlFlowFunc(FuncOp func, - llvm::ArrayRef input_types, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::RefineShapeForControlFlowFunc( + FuncOp func, ArrayRef input_types, int64_t max_iteration) { ModuleOp module = func.getParentOfType(); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); if (num_uses != 1) { - func.emitWarning(llvm::formatv( + func.emitWarning(formatv( "expected control flow function {0} to have exactly 1 use, found {1}.", func.getName(), num_uses)); return failure(); @@ -785,8 +890,7 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, arg_and_idx.value().setType(input_types[arg_and_idx.index()]); } - auto res = - InferShapeUntilFixPoint(&func.getBody(), graph_version, max_iteration); + auto res = InferShapeUntilFixPoint(&func.getBody(), max_iteration); if (failed(res)) return res; auto new_return_types = InferShapeForFunctionReturnType(func); @@ -798,85 +902,98 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, return success(); } -LogicalResult PropagateShapeToFunctions( +LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - llvm::ArrayRef func_names, int64_t graph_version, - int64_t max_iteration) { - bool success = true; + ArrayRef func_names, int64_t max_iteration) { + bool all_succeeded = true; auto types = llvm::to_vector<4>(input_types); for (auto func_name : func_names) { FuncOp func = module.lookupSymbol(func_name); - if (failed(RefineShapeForControlFlowFunc(func, types, graph_version, - max_iteration))) { - success = false; - } + all_succeeded = + succeeded(RefineShapeForControlFlowFunc(func, types, max_iteration)) && + all_succeeded; } - return mlir::success(success); + return success(all_succeeded); } -// If the callee has only one use, propagates any constant operand of call_op to -// the called function body's corresponding argument. 
-// -// TODO(b/154065712): Move this to a more general inter-procedural constant -// folding pass. -void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); + if (num_uses != 1) return; + OpBuilder builder(&func.front().front()); Operation* op = call_op.getOperation(); - if (num_uses == 1) { - // If this is the only caller, and an operand is a constant, propagate - // the constant inside the function. - for (auto arg : func.getArguments()) { - auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); - if (llvm::isa_and_nonnull(operand)) { - arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); + // If this is the only caller, and an operand is a constant, propagate + // the constant value inside the function. + for (auto arg : func.getArguments()) { + auto operand = op->getOperand(arg.getArgNumber()); + if (propagate_caller_callee_constants_) { + if (isa_and_nonnull(operand.getDefiningOp())) { + arg.replaceAllUsesWith( + builder.clone(*operand.getDefiningOp())->getResult(0)); } + continue; } + + auto known_constant = ComputeOutputComponent(ValuePort(operand)); + if (!known_constant) continue; + LLVM_DEBUG(call_op.print(llvm::dbgs() << "Propagate to calee: "); + known_constant.print(llvm::dbgs() << " constant "); + llvm::dbgs() << "\n"); + RecordValue(ValuePort(arg), known_constant); } } -// Propagates any constant return value of the callee function to the call op's -// corresponding result. -void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); - // If the return value is a constant, replace the call result with a constant. + // If the return value is a constant, use the constant as the value of + // the call return. 
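+  // A rough sketch of the effect (the callee and its constant are
+  // illustrative):
+  //
+  //   func @callee() -> tensor<i32> {
+  //     %cst = "tf.Const"() {value = dense<4> : tensor<i32>} : () -> tensor<i32>
+  //     return %cst : tensor<i32>
+  //   }
+  //
+  // For a call to @callee, the result is either rewritten to a cloned constant
+  // (legacy propagate_caller_callee_constants_ mode) or recorded as a known
+  // ValuePort so later queries see dense<4> without rewriting the IR.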
Operation* op = call_op.getOperation(); OpBuilder builder(op); builder.setInsertionPointAfter(op); for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { - auto retval_op = retval.value().getDefiningOp(); - if (llvm::isa_and_nonnull(retval_op)) { - op->getResult(retval.index()) - .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); + if (propagate_caller_callee_constants_) { + auto retval_op = retval.value().getDefiningOp(); + if (isa_and_nonnull(retval_op)) { + op->getResult(retval.index()) + .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); + } + continue; + } + + ValuePort vp(retval.value()); + if (auto known_constant = ComputeOutputComponent(vp)) { + LLVM_DEBUG(known_constant.print(llvm::dbgs() << "Propagate constant "); + call_op.print(llvm::dbgs() << "from "); llvm::dbgs() << "\n"); + RecordValue(ValuePort(op->getResult(retval.index())), known_constant); } } } -LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( + Operation* op, int64_t max_iteration) { ModuleOp module = op->getParentOfType(); if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( - module, llvm::drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_branch(), if_op.else_branch()}, graph_version, - max_iteration); + module, drop_begin(if_op.getOperandTypes(), 1), + {if_op.then_branch(), if_op.else_branch()}, max_iteration); } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions(module, while_op.getOperandTypes(), {while_op.cond(), while_op.body()}, - graph_version, max_iteration); + max_iteration); } else if (auto call_op = dyn_cast(op)) { CallInterfaceCallable callable = call_op.getCallableForCallee(); if (SymbolRefAttr sym = callable.dyn_cast()) { PropagateConstantToCallee(call_op, sym, module); if (failed(PropagateShapeToFunctions( module, call_op.getArgOperands().getTypes(), - {sym.getRootReference()}, graph_version, max_iteration))) { + {sym.getRootReference()}, max_iteration))) { return failure(); } PropagateConstantFromCallee(call_op, sym, module); @@ -889,13 +1006,71 @@ LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, return success(); } -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration) { - MLIRContext* ctx = region->getContext(); - Dialect* tf_dialect = ctx->getRegisteredDialect(); +LogicalResult ShapeInference::TryToFold(Operation* op) { + LLVM_DEBUG(op->print(llvm::dbgs() << "TryToFold "); llvm::dbgs() << "\n"); + // If any output result is known, then the op probably has been computed + // before. + if (op->getNumResults() > 0 && results_[ValuePort(op->getResult(0))]) + return success(); - // An operation folder that is used to attempt folding before inference. - OperationFolder folder(ctx); + SmallVector constant_operands(op->getNumOperands()); + SmallVector fold_results; + + // Check to see if any operands to the operation is constant and whether + // the operation knows how to constant fold itself. + bool some_unknown = false; + for (int i = 0, e = op->getNumOperands(); i != e; ++i) { + if (!(constant_operands[i] = + ComputeOutputComponent(ValuePort(op->getOperand(i))))) + some_unknown = true; + } + + // Attempt to constant fold the operation. 
+ auto* abstract_op = op->getAbstractOperation(); + LogicalResult folded = failure(); + if (abstract_op) { + folded = abstract_op->foldHook(op, constant_operands, fold_results); + } + // Attempt dialect fallback if op's fold hook failed. + if (failed(folded)) { + Dialect* dialect = op->getDialect(); + if (!dialect) return failure(); + // Only attempt TF dialect fallback if there are no unknown operands. + if (some_unknown && dialect == tf_dialect_) return failure(); + SmallVector constants; + if (failed(dialect->constantFoldHook(op, constant_operands, constants))) + return failure(); + fold_results.assign(constants.begin(), constants.end()); + } + + for (auto result : zip(op->getResults(), fold_results)) { + auto fold_result = std::get<1>(result); + Attribute attr = nullptr; + if ((attr = fold_result.dyn_cast())) { + RecordValue(ValuePort(std::get<0>(result)), attr); + } else { + auto value = fold_result.get(); + if ((attr = ComputeOutputComponent(ValuePort(value)))) + RecordValue(ValuePort(std::get<0>(result)), attr); + } + + if (ElementsAttr eattr = attr.dyn_cast_or_null()) { + if (std::get<0>(result).getType() == eattr.getType()) continue; + + // Inserts a cast back to the original type if any user is not in the + // TF dialect. + Type old_type = std::get<0>(result).getType(); + std::get<0>(result).setType(eattr.getType()); + AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, + old_type); + } + } + + return success(); +} + +LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, + int64_t max_iteration) { bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -908,29 +1083,27 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, << "Shape inference, iteration " << iteration << "\n"); region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { - changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect); - // TODO(jpienaar): Debug why we can't just return here. We end up with - // additional constant due to the propagation of constant into attached - // function if we return already. - } - - if (op->getDialect() != tf_dialect) { - changed |= InferShapeForNonTFDialectOperation(op, tf_dialect); + changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); return; } - // Before attempting inference, just try to fold the operation. - if (succeeded(folder.tryToFold(op))) return; + if (op->getDialect() != tf_dialect_) { + changed |= InferShapeForNonTFDialectOperation(op, tf_dialect_); + return; + } + + // Before attempting inference, just try to compute the folded + // value/shape. + if (succeeded(TryToFold(op))) return; // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. 
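InferShapeUntilFixPoint above is a bounded fixed-point loop: walk every op, and stop once a full pass over the region changes nothing or max_iteration is reached. A minimal sketch of that control flow with a toy refinement callback (not the pass's API):

#include <functional>
#include <iostream>
#include <vector>

// Returns true if a fixed point was reached before max_iteration.
bool InferUntilFixPoint(std::vector<int>& ops,
                        const std::function<bool(int&)>& refine_one,
                        int max_iteration = 10) {
  bool changed = true;
  for (int iteration = 0; iteration < max_iteration && changed; ++iteration) {
    changed = false;
    for (int& op : ops) changed |= refine_one(op);  // walk every op once per round
  }
  return !changed;
}

int main() {
  std::vector<int> ranks = {-1, -1, 3};  // -1 models an unranked result type
  // Toy refinement: an unranked value becomes ranked the first time it is visited.
  auto refine = [](int& r) { if (r < 0) { r = 3; return true; } return false; };
  std::cout << (InferUntilFixPoint(ranks, refine) ? "converged" : "hit limit") << "\n";
}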
- if (failed(PropagateShapeIntoAttachedFunctions(op, graph_version, - max_iteration))) { + if (failed(PropagateShapeIntoAttachedFunctions(op, max_iteration))) { op->emitWarning() << "unable to refine shape of attached function " "arguments and bodies"; } - changed |= InferShapeForSingleOperation(op, tf_dialect, graph_version); + changed |= InferShapeForSingleOperation(op); }); } @@ -944,32 +1117,46 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, - int64_t graph_version) { - mlir::FunctionType func_type = func.getType(); + int64_t graph_version, + bool propagate_caller_callee_constants) { + ShapeInference context(graph_version, func.getContext(), + propagate_caller_callee_constants); + if (arg_shapes.empty()) { + if (failed(context.InferShapeUntilFixPoint(&func.getBody()))) + return failure(); + // TODO(b/156276510): Verify that it is always fine to refine a function's + // return type, as long as we do not change the argument shapes. + if (auto return_types = InferShapeForFunctionReturnType(func)) { + func.setType(FunctionType::get(func.getType().getInputs(), + return_types.getValue(), + func.getContext())); + } + + return success(); + } + FunctionType func_type = func.getType(); bool needs_refinement = false; - llvm::SmallVector new_arg_types; + SmallVector new_arg_types; new_arg_types.reserve(func_type.getNumInputs()); // Update argument types in-place using the provided arg_shapes. for (size_t i = 0; i < func_type.getNumInputs(); ++i) { ArrayRef shape = arg_shapes[i]; - mlir::Type element_type; - if (auto input_ty = - func_type.getInput(i).dyn_cast()) { - if (!input_ty || input_ty.getShape().size() != shape.size()) { + Type element_type; + if (auto input_ty = func_type.getInput(i).dyn_cast()) { + if (input_ty.getRank() != shape.size()) { return failure(); } element_type = input_ty.getElementType(); } else { - auto unranked_input_ty = - func_type.getInput(i).dyn_cast(); + auto unranked_input_ty = func_type.getInput(i).dyn_cast(); if (!unranked_input_ty) { return failure(); } element_type = unranked_input_ty.getElementType(); } - auto new_arg_type = mlir::RankedTensorType::get(shape, element_type); + auto new_arg_type = RankedTensorType::get(shape, element_type); if (new_arg_type != func_type.getInput(i)) { // If the new type is more detailed, trigger shape inference. func.getArgument(i).setType(new_arg_type); @@ -982,28 +1169,17 @@ LogicalResult InferShapeForFunction(FuncOp func, return success(); } - mlir::LogicalResult result = - mlir::TF::InferShapeUntilFixPoint(&func.getBody(), graph_version); + LogicalResult result = context.InferShapeUntilFixPoint(&func.getBody()); if (failed(result)) { return failure(); } auto return_types = InferShapeForFunctionReturnType(func); - func.setType(mlir::FunctionType::get(new_arg_types, - return_types.hasValue() - ? return_types.getValue() - : func.getType().getResults(), - func.getContext())); - - return success(); -} - -LogicalResult InferShapeForFunctionType(FuncOp func) { - if (auto return_types = InferShapeForFunctionReturnType(func)) { - func.setType(mlir::FunctionType::get(func.getType().getInputs(), - return_types.getValue(), - func.getContext())); - } + func.setType(FunctionType::get(new_arg_types, + return_types.hasValue() + ? 
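A small sketch of the per-argument refinement decision in InferShapeForFunction above, using a toy type in place of MLIR's RankedTensorType/UnrankedTensorType: a provided shape is rejected on rank mismatch, and inference is only triggered when the rebuilt type is genuinely more detailed than the current one.

#include <iostream>
#include <vector>

struct ToyType {
  bool ranked;               // false models an unranked tensor
  std::vector<long> shape;   // meaningful only when ranked
};

// Returns true if refinement should be triggered; sets *error on rank mismatch
// (the real pass returns failure() in that case).
bool RefineArgument(ToyType& input, const std::vector<long>& arg_shape, bool* error) {
  *error = false;
  if (input.ranked && input.shape.size() != arg_shape.size()) {
    *error = true;
    return false;
  }
  if (input.ranked && input.shape == arg_shape) return false;  // nothing new to learn
  input = {true, arg_shape};  // adopt the more detailed type
  return true;
}

int main() {
  ToyType arg{false, {}};  // unranked argument
  bool error = false;
  bool needs_refinement = RefineArgument(arg, {2, 224, 224, 3}, &error);
  std::cout << std::boolalpha << needs_refinement << " " << error << "\n";  // true false
}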
return_types.getValue() + : func.getType().getResults(), + func.getContext())); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h index 0524ec678ed..7486fd77388 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h @@ -27,29 +27,14 @@ namespace mlir { namespace TF { -// Performs shape inference on the provided op and return true if the type of -// at least one result has been changed. -// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. -// `graph_version` indicates the current GraphDef compatibility versions -// (the versions field in graph.proto). -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version); - -// Infers shape on the provided region, including nested ones, iterate until fix -// point with a limit of max_iteration. Returns success if fix point is reached -// before max_iteration. -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration = 10); - // Given a list of refined shapes matching the function arguments of func, runs // shape inference over the function to propagate this updated information. -LogicalResult InferShapeForFunction(FuncOp func, - ArrayRef> arg_shapes, - int64_t graph_version); - -// Refines the return type of the given function by folding tf.Cast that -// precedes the return instruction. -LogicalResult InferShapeForFunctionType(FuncOp func); +// If arg_shapes are empty, then argument shapes will be left unchanged. +// TODO(b/154065712): Remove propagate_caller_callee_constants once using +// SCCP pass instead. +LogicalResult InferShapeForFunction( + FuncOp func, ArrayRef> arg_shapes, int64_t graph_version, + bool propagate_caller_callee_constants = true); } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc index 48e4e77ce0f..1a846398412 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc @@ -47,8 +47,15 @@ namespace { // This transformation pass propagate shapes on the TensorFlow graph. // It is a ModulePass in order to be able to change function types. -struct ShapeInference +class ShapeInference : public PassWrapper> { + public: + ShapeInference() = default; + ShapeInference(const ShapeInference& that) { + propagate_caller_callee_constants_ = + that.propagate_caller_callee_constants_; + } + void runOnOperation() override { auto module = getOperation(); auto producer_or = tensorflow::GetTfGraphProducerVersion(module); @@ -58,12 +65,17 @@ struct ShapeInference } int64_t producer = producer_or.ValueOrDie(); for (auto func : module.getOps()) { - InferShapeUntilFixPoint(&func.getBody(), producer); - // TODO(yuanzx): Verify that it is always fine to refine a function's - // return type, as long as we do not change the argument shapes. 
- InferShapeForFunctionType(func); + if (failed(InferShapeForFunction(func, /*arg_shapes=*/{}, producer, + propagate_caller_callee_constants_))) + return signalPassFailure(); } } + + private: + Option propagate_caller_callee_constants_{ + *this, "propagate-caller-callee-constants", + llvm::cl::desc("Propagate constants between callers and callees"), + llvm::cl::init(true)}; }; PassRegistration pass( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 6e27823191b..b2203c890e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -254,22 +254,14 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, if (output_buffer_to_size.empty() && arg_no_changed) return success(); // Recreate the If op. auto new_if_operands = llvm::to_vector<8>(if_op.getOperands()); - auto new_output_shapes = llvm::to_vector<8>(if_op.output_shapes().getValue()); for (int64_t i = 1; i < if_op.getNumOperands(); ++i) { auto it = buffer_to_size->find(if_op.getOperand(i)); if (it == buffer_to_size->end()) continue; new_if_operands.push_back(it->getSecond().size); - if (!new_output_shapes.empty()) { - // Size is a scalar shape. - tensorflow::TensorShapeProto shape_proto; - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto))); - } } auto new_if = OpBuilder(if_op).create( if_op.getLoc(), then_branch.getType().getResults(), new_if_operands, if_op.getAttrs()); - new_if.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); for (const auto& entry : output_buffer_to_size) { (*buffer_to_size)[new_if.getResult(std::get<0>(entry))] = { new_if.getResult(std::get<1>(entry)), std::get<2>(entry)}; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 141feeb6b24..688f21c1d52 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -14,11 +14,31 @@ limitations under the License. 
==============================================================================*/ #include +#include +#include +#include +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" namespace mlir { namespace TFTPU { @@ -30,30 +50,400 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; -struct TPUExtractHeadTailOutsideCompilation - : public PassWrapper { - void runOnFunction() override; -}; +bool HasOutsideCompilationAttribute(Operation* op) { + return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; +} -void TPUExtractHeadTailOutsideCompilation::runOnFunction() { - getFunction().walk([&](tf_device::LaunchOp launch) { - Block& launch_block = launch.GetBody(); - for (auto& op : llvm::make_early_inc_range(launch_block.getOperations())) { - // TODO(b/155115766): Handle outputs that should be inputs to TPU - // LaunchOp. - if (auto attr = - op.getAttrOfType(kXlaOutsideCompilationAttr)) { - op.moveBefore(launch); - } else { +// Finds op that created a given value. If the value is a BlockArgument, this +// returns the owner of the Block. +Operation* GetOpOfValue(Value value) { + if (auto block_arg = value.dyn_cast()) + return block_arg.getOwner()->getParentOp(); + + return value.getDefiningOp(); +} + +// Checks if `op` is nested in `block`. +bool OpInBlock(Operation* op, Block* block) { + Block* op_block = op->getBlock(); + while (op_block) { + if (op_block == block) return true; + if (auto* parent_op = op_block->getParentOp()) { + op_block = parent_op->getBlock(); + } else { + break; + } + } + return false; +} + +// Wraps block in a Launch. External uses of ops in the block will be return +// values of the Launch and remapped to the Launch results. If `before` is set +// to true, the Launch is created before `op`. Otherwise the Launch is created +// after `op`. +tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, + bool before, Block* launch_block, + llvm::StringRef host_device) { + // Find results and result types of ops in block that needs to returned. 
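OpInBlock above answers "is this op nested anywhere under that block" by walking up through parent blocks. A standalone sketch of the same ancestor walk with toy node types (not MLIR's Operation/Block classes):

#include <iostream>

struct ToyBlock;
struct ToyOp { ToyBlock* block = nullptr; };      // the block an op lives in
struct ToyBlock { ToyOp* parent_op = nullptr; };  // the op this block is nested in, if any

// True if `op` is (transitively) nested inside `block`.
bool OpInBlock(const ToyOp* op, const ToyBlock* block) {
  for (const ToyBlock* b = op->block; b != nullptr;
       b = b->parent_op ? b->parent_op->block : nullptr) {
    if (b == block) return true;
  }
  return false;
}

int main() {
  ToyBlock outer, inner;
  ToyOp region_op{&outer};       // an op in the outer block...
  inner.parent_op = &region_op;  // ...owns a nested block
  ToyOp nested_op{&inner};
  std::cout << OpInBlock(&nested_op, &outer) << "\n";  // 1
}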
+ llvm::SmallVector launch_results; + llvm::SmallVector launch_result_types; + for (Operation& head_outside_compiled_op : *launch_block) { + for (Value result : head_outside_compiled_op.getResults()) { + bool has_external_uses = false; + for (Operation* user : result.getUsers()) { + if (OpInBlock(user, launch_block)) continue; + has_external_uses = true; break; } + if (has_external_uses) { + launch_results.push_back(result); + launch_result_types.push_back(result.getType()); + } } - }); + } + + before ? builder->setInsertionPoint(op) : builder->setInsertionPointAfter(op); + auto launch = builder->create( + op->getLoc(), builder->getStringAttr(host_device), launch_result_types); + launch.body().push_back(launch_block); + + builder->setInsertionPointToEnd(&launch.GetBody()); + builder->create(op->getLoc(), launch_results); + + return launch; +} + +// Parses TPU compilation and execution devices from a TPU cluster and returns +// the host device for the head and tail computations. If the TPU computation is +// replicated, kTPUReplicatedHost is returned instead. +LogicalResult GetHostDeviceForHeadTailComputation( + mlir::TF::RuntimeDevices devices, tf_device::ClusterOp cluster, + std::string* host_device) { + auto replicate = cluster.getParentOfType(); + if (replicate) { + *host_device = tensorflow::kTPUReplicatedHost; + return success(); + } + + auto num_cores_per_replica_attr = + cluster.getAttrOfType(tensorflow::kNumCoresPerReplicaAttr); + if (!num_cores_per_replica_attr) + return cluster.emitOpError( + "cluster op missing `num_cores_per_replica` attribute"); + + if (num_cores_per_replica_attr.getInt() != 1) + return cluster.emitOpError( + "outside compilation is not supported with model parallelism."); + + auto topology_attr = + cluster.getAttrOfType(tensorflow::kTopologyAttr); + if (!topology_attr) + return cluster.emitOpError("cluster op missing `topology` attribute"); + + auto device_assignment_attr = + cluster.getAttrOfType(tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster.emitOpError(llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + + if (!status_or_device_coodinates.ok()) + return cluster.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); + + // Determine compilation and execution devices. + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + /*num_cores_per_replica=*/1, topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); + if (!status_or_tpu_device_assignment.ok()) + return cluster.emitError() + << "error in fetching TPU compilation/execution devices: " + << status_or_tpu_device_assignment.status().error_message(); + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); + + *host_device = tpu_device_assignment.tpu_devices[0][0].host; + return success(); +} + +// Returns a set of ops that are outside compiled and can be extracted to before +// the TPU computation. These ops are either connected to the inputs of the TPU +// computation or other ops that can be extracted, and have no operands from +// other ops in the TPU computation that cannot be extracted. 
+llvm::SmallVector FindOutsideCompiledOpsAtHead( + tf_device::ClusterOp cluster) { + Region* cluster_region = &cluster.body(); + llvm::SmallSetVector head_outside_compiled_ops; + + auto cluster_ops = cluster.GetBody().without_terminator(); + for (Operation& cluster_op : cluster_ops) { + if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + // An outside compiled op can be extracted if its operands are not from + // other ops in the cluster that cannot be extracted. + auto walk_result = cluster_op.walk([&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (head_outside_compiled_ops.count(operand_op)) continue; + + if (operand_op->getParentRegion() == cluster_region) + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (!walk_result.wasInterrupted()) + head_outside_compiled_ops.insert(&cluster_op); + } + + return head_outside_compiled_ops.takeVector(); +} + +// Moves head outside compiled ops into its own `tf_device.LaunchOp` +// computation before the cluster. +void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef head_outside_compiled_ops, + llvm::StringRef host_device) { + Block* launch_block = new Block; + for (Operation* head_outside_compiled_op : head_outside_compiled_ops) + head_outside_compiled_op->moveBefore(launch_block, launch_block->end()); + + tf_device::LaunchOp launch = CreateLaunchForBlock( + builder, cluster, /*before=*/true, launch_block, host_device); + + for (auto result : llvm::zip(launch.GetBody().getTerminator()->getOperands(), + launch.getResults())) + replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), + cluster.body()); +} + +// Extracts and move outside compiled ops that have no dependencies in the +// cluster to before the cluster. +mlir::LogicalResult LiftHeadOutsideCompiledOps( + OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, + tf_device::ClusterOp cluster, std::string* host_device, + bool* cluster_updated) { + llvm::SmallVector head_outside_compiled_ops = + FindOutsideCompiledOpsAtHead(cluster); + if (head_outside_compiled_ops.empty()) return success(); + if (failed( + GetHostDeviceForHeadTailComputation(devices, cluster, host_device))) + return failure(); + + CreateHeadComputation(builder, cluster, head_outside_compiled_ops, + *host_device); + + *cluster_updated = true; + return success(); +} + +// Fills `tail_outside_compiled_ops` with ops that are outside compiled and +// can be extracted to after the TPU computation, and `cluster_results` with new +// results of the cluster. These ops are either connected to the output of the +// TPU computation or other ops that can be extracted, and have no results used +// by other ops in the TPU computation that cannot be extracted. 
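A compact model of the head-extraction criterion implemented above, on a toy op list rather than an MLIR region (names hypothetical): walking the cluster forward, an outside-compiled op is liftable only if every in-cluster producer of its operands has already been selected; values defined outside the cluster never block extraction.

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyOp {
  std::string name;
  bool outside_compiled;
  std::vector<std::string> operands;  // only in-cluster producers are listed here
};

int main() {
  std::vector<ToyOp> cluster = {
      {"a", true, {}},      // no in-cluster operands -> liftable
      {"b", false, {"a"}},  // device op, stays in the TPU cluster
      {"c", true, {"b"}},   // depends on an unextractable cluster op -> stays
  };
  std::set<std::string> head;
  for (const ToyOp& op : cluster) {
    if (!op.outside_compiled) continue;
    bool ok = true;
    for (const std::string& operand : op.operands)
      if (!head.count(operand)) ok = false;  // produced by an unextracted cluster op
    if (ok) head.insert(op.name);
  }
  for (const std::string& n : head) std::cout << n << "\n";  // prints: a
}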
+void FindOutsideCompiledOpsAtTailAndClusterResults( + tf_device::ClusterOp cluster, + llvm::SmallVectorImpl* tail_outside_compiled_ops, + llvm::SmallVectorImpl* cluster_results) { + Region* cluster_region = &cluster.body(); + llvm::SmallSetVector tail_outside_compiled_ops_set; + Operation* terminator = cluster.GetBody().getTerminator(); + llvm::SmallSetVector cluster_results_set; + cluster_results_set.insert(terminator->getOperands().begin(), + terminator->getOperands().end()); + + auto cluster_ops = llvm::reverse(cluster.GetBody().without_terminator()); + for (Operation& cluster_op : cluster_ops) { + if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + + llvm::SmallVector results_to_forward; + bool can_be_extracted = + llvm::all_of(cluster_op.getUsers(), [&](Operation* op) { + return op == terminator || tail_outside_compiled_ops_set.count(op); + }); + if (!can_be_extracted) continue; + + // Collect operands of cluster op that are generated within the cluster. + // These values should be returned by the cluster. + cluster_op.walk([&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (operand_op->getParentRegion() == cluster_region) + cluster_results_set.insert(operand); + } + }); + + // Remove results of op to be extracted as there are no uses in the cluster. + for (Value result : cluster_op.getResults()) + cluster_results_set.remove(result); + tail_outside_compiled_ops_set.insert(&cluster_op); + } + + *tail_outside_compiled_ops = tail_outside_compiled_ops_set.takeVector(); + *cluster_results = cluster_results_set.takeVector(); +} + +// Moves tail outside compiled ops into its own `tf_device.LaunchOp` +// computation after the cluster. +void CreateTailComputation(OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef tail_outside_compiled_ops, + llvm::StringRef host_device) { + Block* launch_block = new Block; + for (Operation* tail_outside_compiled_op : tail_outside_compiled_ops) + tail_outside_compiled_op->moveBefore(launch_block, launch_block->begin()); + + tf_device::LaunchOp launch = CreateLaunchForBlock( + builder, cluster, /*before=*/false, launch_block, host_device); + + auto operand_not_in_launch = [&](OpOperand& operand) { + return !launch.getOperation()->isProperAncestor(operand.getOwner()); + }; + for (auto result : llvm::zip(launch.GetBody().getTerminator()->getOperands(), + launch.getResults())) + std::get<0>(result).replaceUsesWithIf(std::get<1>(result), + operand_not_in_launch); +} + +// Updates cluster with updated cluster results after extracting tail outside +// compiled ops. 
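The tail side mirrors the head side. A toy sketch of the reverse walk above: an outside-compiled op can move after the cluster only if each of its users is the terminator or an op already selected for extraction (stand-in types, not the pass's API).

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyOp {
  std::string name;
  bool outside_compiled;
  std::vector<std::string> users;  // in-cluster users; "return" models the terminator
};

int main() {
  std::vector<ToyOp> cluster = {
      {"x", false, {"y"}},      // device op feeding y
      {"y", true, {"return"}},  // only used by the terminator -> liftable
      {"z", true, {"x"}},       // feeds a device op -> must stay
  };
  std::set<std::string> tail;
  for (auto it = cluster.rbegin(); it != cluster.rend(); ++it) {
    if (!it->outside_compiled) continue;
    bool ok = true;
    for (const std::string& user : it->users)
      if (user != "return" && !tail.count(user)) ok = false;
    if (ok) tail.insert(it->name);
  }
  for (const std::string& n : tail) std::cout << n << "\n";  // prints: y
}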
+tf_device::ClusterOp UpdateClusterResults( + OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef new_cluster_results) { + Operation* old_terminator = cluster.GetBody().getTerminator(); + builder->setInsertionPoint(old_terminator); + builder->create(old_terminator->getLoc(), + new_cluster_results); + old_terminator->erase(); + + builder->setInsertionPoint(cluster); + llvm::SmallVector new_cluster_result_types; + new_cluster_result_types.reserve(new_cluster_results.size()); + for (const auto& new_cluster_result : new_cluster_results) + new_cluster_result_types.push_back(new_cluster_result.getType()); + + auto new_cluster = builder->create( + cluster.getLoc(), new_cluster_result_types, + /*operands=*/llvm::ArrayRef{}, cluster.getAttrs()); + new_cluster.body().takeBody(cluster.body()); + + auto operand_not_in_cluster = [&](OpOperand& operand) { + return !new_cluster.getOperation()->isProperAncestor(operand.getOwner()); + }; + for (auto result : + llvm::zip(new_cluster.GetBody().getTerminator()->getOperands(), + new_cluster.getResults())) + std::get<0>(result).replaceUsesWithIf(std::get<1>(result), + operand_not_in_cluster); + + cluster.erase(); + return new_cluster; +} + +// Extracts and move outside compiled ops that do not create dependencies in the +// cluster to after the cluster. +mlir::LogicalResult LiftTailOutsideCompiledOps( + OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, + std::string host_device, tf_device::ClusterOp* cluster, + bool* cluster_updated) { + llvm::SmallVector tail_outside_compiled_ops; + llvm::SmallVector cluster_results; + FindOutsideCompiledOpsAtTailAndClusterResults( + *cluster, &tail_outside_compiled_ops, &cluster_results); + if (tail_outside_compiled_ops.empty()) return success(); + + if (host_device.empty()) + if (failed(GetHostDeviceForHeadTailComputation(devices, *cluster, + &host_device))) + return failure(); + + // Forward all results of cluster first. These results will be remapped once + // a new cluster is formed. + cluster->replaceAllUsesWith( + cluster->GetBody().getTerminator()->getOperands()); + + CreateTailComputation(builder, *cluster, tail_outside_compiled_ops, + host_device); + + *cluster = UpdateClusterResults(builder, *cluster, cluster_results); + + *cluster_updated = true; + return success(); +} + +// Removes aliased outputs in cluster from ops outside of cluster. 
+void RemoveClusterAliasedOutputs(OpBuilder* builder, + tf_device::ClusterOp cluster) { + llvm::SmallVector used_old_cluster_results; + llvm::SmallVector new_cluster_results; + llvm::SmallVector new_cluster_result_types; + Operation* cluster_terminator = cluster.GetBody().getTerminator(); + for (auto result : + llvm::zip(cluster_terminator->getOperands(), cluster.getResults())) { + Value cluster_terminator_operand = std::get<0>(result); + if (cluster.getOperation()->isProperAncestor( + cluster_terminator_operand.getDefiningOp())) { + new_cluster_results.push_back(cluster_terminator_operand); + new_cluster_result_types.push_back(cluster_terminator_operand.getType()); + used_old_cluster_results.push_back(std::get<1>(result)); + } else { + std::get<1>(result).replaceAllUsesWith(cluster_terminator_operand); + } + } + + if (new_cluster_results.size() == cluster.getNumResults()) return; + + builder->setInsertionPoint(cluster); + auto new_cluster = builder->create( + cluster.getLoc(), new_cluster_result_types, + /*operands=*/llvm::ArrayRef{}, cluster.getAttrs()); + new_cluster.body().takeBody(cluster.body()); + new_cluster.GetBody().getTerminator()->setOperands(new_cluster_results); + + for (auto result : + llvm::zip(used_old_cluster_results, new_cluster.getResults())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); + + cluster.erase(); +} + +struct TPUExtractHeadTailOutsideCompilation + : public PassWrapper> { + void runOnOperation() override; +}; + +void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + // Get runtime devices information from the closest parent module. + auto module = getOperation(); + mlir::TF::RuntimeDevices devices; + if (failed(tensorflow::GetDevicesFromOp(module, &devices))) + return signalPassFailure(); + + OpBuilder builder(&getContext()); + llvm::SmallVector clusters; + module.walk( + [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); + + for (tf_device::ClusterOp cluster : clusters) { + std::string host_device; + bool cluster_updated = false; + if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster, + &host_device, &cluster_updated)) || + failed(LiftTailOutsideCompiledOps(&builder, devices, host_device, + &cluster, &cluster_updated))) + return signalPassFailure(); + if (cluster_updated) RemoveClusterAliasedOutputs(&builder, cluster); + } } } // anonymous namespace -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 4e20cd9d64b..93e5cc22c30 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -19,22 +19,27 @@ limitations under the License. 
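A toy illustration of the RemoveClusterAliasedOutputs logic above (hypothetical data model): cluster results whose returned value is not produced inside the cluster are bypassed entirely, and only the remaining results survive on the rebuilt cluster.

#include <iostream>
#include <string>
#include <vector>

struct ToyResult {
  std::string returned_value;  // the terminator operand backing this cluster result
  bool defined_in_cluster;
};

int main() {
  std::vector<ToyResult> results = {
      {"%inside", true},    // produced by the cluster body: keep as a result
      {"%outside", false},  // pass-through of an outer value: forward users directly
  };
  std::vector<std::string> kept;
  for (const ToyResult& r : results) {
    if (r.defined_in_cluster)
      kept.push_back(r.returned_value);
    else
      std::cout << "forward uses of this cluster result to " << r.returned_value << "\n";
  }
  std::cout << "rebuilt cluster has " << kept.size() << " result(s)\n";  // 1
}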
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/core/platform/logging.h" namespace mlir { namespace TFTPU { namespace { -constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; +constexpr char kAncestorsAttr[] = "ancestors"; constexpr char kDeviceAttr[] = "device"; +constexpr char kKeyAttr[] = "key"; +constexpr char kShapesAttr[] = "shapes"; +constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; // Mapping for `_xla_outside_compilation` attribute to ops of a cluster. -using ClusterMap = +using OutsideClusterMap = llvm::SmallDenseMap, 8>; // This pass extracts a CPU computation cluster with `_xla_outside_compilation` @@ -51,7 +56,8 @@ struct TPUExtractOutsideCompilation // Collects and clusters ops in `block` with the same `_xla_outside_compilation` // attribute into `clusters` This returns an error if a // `_xla_outside_compilation` attribute of an op is empty. -LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { +LogicalResult CollectAndGroupOutsideClusterOps(Block* block, + OutsideClusterMap* clusters) { for (Operation& op : *block) { if (auto attr = op.getAttrOfType(kXlaOutsideCompilationAttr)) { if (attr.getValue().empty()) @@ -67,7 +73,7 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { } // Moves `cluster_ops` to associated `launch_op` body. -void MoveClusterOpsToLaunchOp( +void MoveOutsideClusterOpsToLaunchOp( tf_device::LaunchOp launch_op, const llvm::SmallVector& cluster_ops) { MLIRContext* context = launch_op.getContext(); @@ -84,8 +90,8 @@ void MoveClusterOpsToLaunchOp( } // Creates a `tf_device::LaunchOp` to wrap cluster ops. -tf_device::LaunchOp CreateLaunchOpForCluster(OpBuilder* builder, - Operation* last_cluster_op) { +tf_device::LaunchOp CreateLaunchOpForOutsideCluster( + OpBuilder* builder, Operation* last_cluster_op) { // TODO(b/154363171): Set the CPU device. // An empty string placeholder is used for the device as that will be later // populated with the device of the associated TPUReplicateMetadata op. @@ -115,16 +121,159 @@ void PropagateParallelExecuteReturnToReplicate( parallel_execute_op.execute_outputs()); } +// Extracts all externally provided operands of `cluster_ops`. +llvm::SmallSetVector GetExternalOperands( + const llvm::SmallVector& cluster_ops) { + llvm::SmallSetVector external_values; + + for (Operation* op : cluster_ops) { + for (Value v : op->getOperands()) { + Operation* defining_op = v.getDefiningOp(); + if (!defining_op) continue; + bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { + return defining_op == cluster_op; + }); + + if (is_external) external_values.insert(v); + } + } + + return external_values; +} + +// Extracts all externally used outputs of `cluster_ops`. 
+llvm::SmallVector GetExternalOutputs( + const llvm::SmallVector& cluster_ops) { + llvm::SmallSetVector external_outputs; + + for (Operation* op : cluster_ops) { + for (Operation* user : op->getUsers()) { + bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { + return user == cluster_op; + }); + if (!is_external) continue; + for (Value v : user->getOperands()) { + if (v.getDefiningOp() == op) external_outputs.insert(v); + } + } + } + + return external_outputs.takeVector(); +} + +// Sets the insertion point on `builder` for HostCompute op. Sets insertion +// point to the first op in `cluster_ops` that has one of `external_inputs` +// as an operand. If there are no external_inputs, set insertion point to first +// cluster_op. +void SetHostComputeInsertion( + OpBuilder* builder, const llvm::SmallVector& cluster_ops, + const llvm::SmallSetVector& external_inputs) { + if (external_inputs.empty()) builder->setInsertionPoint(cluster_ops.front()); + for (const auto& cluster_op : cluster_ops) { + for (Value v : cluster_op->getOperands()) { + if (external_inputs.count(v)) { + builder->setInsertionPoint(cluster_op); + return; + } + } + } +} + +// Creates the HostCompute with `inputs` and `outputs` +// using `communication_key`. +TF::_HostComputeMlirOp CreateHostCompute( + OpBuilder* builder, tf_device::ClusterOp tpu_cluster, + const llvm::SmallVector& cluster_ops, + const llvm::SmallSetVector& inputs, llvm::ArrayRef outputs, + const std::string& communication_key) { + llvm::SmallVector device_output_types; + for (const auto& output : outputs) + device_output_types.push_back(output.getType()); + SetHostComputeInsertion(builder, cluster_ops, inputs); + auto host_compute = builder->create( + tpu_cluster.getLoc(), device_output_types, inputs.getArrayRef(), + llvm::ArrayRef{}); + host_compute.setAttr(kAncestorsAttr, builder->getArrayAttr({})); + host_compute.setAttr(kShapesAttr, builder->getArrayAttr({})); + host_compute.setAttr(kKeyAttr, builder->getStringAttr(communication_key)); + return host_compute; +} + +void MoveOutsideCompiledOps( + tf_device::ClusterOp tpu_cluster, llvm::StringRef outside_cluster_name, + tf_device::LaunchOp host_launch_op, + const llvm::SmallVector& cluster_ops, + const llvm::SmallSetVector& external_inputs, + const llvm::SmallVector& external_outputs) { + if (external_inputs.empty() && external_outputs.empty()) { + MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + return; + } + + OpBuilder builder(host_launch_op.GetBody().getTerminator()); + auto result_type = + RankedTensorType::get({}, builder.getType()); + + std::string txt_metadata; + std::string txt_module; + // TODO(b/157054714): Use a better abstraction instead of _TPUCompileMlirOp + // and _XlaRecvAtHostOp and _XlaSendFromHostOp. + + // A placeholder _TpuCompileMlirOp is created because it is required input to + // XlaRecvAtHostOp and XlaSendFromHostOp but the _TpuCompileMlirOp has not yet + // been created for the TPU cluster that contains the outside compiled ops. + // This placeholder should be replaced by the TPU cluster _TPUCompileMlirOp in + // a subsequent pass. 
+ auto compile_op = builder.create( + tpu_cluster.getLoc(), /*compilation_status=*/result_type, /*program=*/ + llvm::ArrayRef{result_type}, llvm::ArrayRef{}, txt_module, + txt_metadata); + + llvm::SmallVector host_output_types; + for (const auto& external_input : external_inputs) + host_output_types.push_back(external_input.getType()); + + std::string communication_key = + llvm::formatv("host_compute_channel_{0}", outside_cluster_name).str(); + // XlaRecvAtHostOp takes both the program key(dynamic_key) from the + // _TpuCompileMlirOp and the communication_key. + auto recv_at_host = builder.create( + tpu_cluster.getLoc(), host_output_types, + /*dynamic_key=*/compile_op.getResult(1), + builder.getStringAttr(communication_key), + builder.getIntegerAttr(builder.getIntegerType(64), 0)); + + auto host_compute = + CreateHostCompute(&builder, tpu_cluster, cluster_ops, external_inputs, + external_outputs, communication_key); + MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + + builder.setInsertionPoint(host_launch_op.GetBody().getTerminator()); + builder.create( + tpu_cluster.getLoc(), external_outputs, + /*dynamic_key=*/compile_op.getResult(1), + builder.getStringAttr(communication_key), + /*device_ordinal=*/builder.getIntegerAttr(builder.getIntegerType(64), 0)); + + for (auto result : llvm::zip(external_inputs, recv_at_host.getResults())) + mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), + host_launch_op.body()); + + for (auto result : llvm::zip(external_outputs, host_compute.getResults())) + mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), + tpu_cluster.body()); +} + // Creates a `parallel_execute` op in place of launch with 'clusters` and // 'launch` as regions. -void CreateParallelExecuteFromClusters(tf_device::LaunchOp launch, - const ClusterMap& clusters) { - OpBuilder builder(launch); +void CreateParallelExecuteFromOutsideClusters( + tf_device::ClusterOp tpu_cluster, const OutsideClusterMap& clusters) { + OpBuilder builder(tpu_cluster); // Create parallel_execute regions. The original TPU cluster computation // is the extra region. - int num_regions = 1 + clusters.size(); + const int num_regions = 1 + clusters.size(); auto parallel_execute_op = builder.create( - launch.getLoc(), num_regions, launch.results().getTypes()); + tpu_cluster.getLoc(), num_regions, tpu_cluster.results().getTypes()); // Move outside compilation clusters to parallel_execute regions. for (const auto& cluster : llvm::enumerate(clusters)) { @@ -133,41 +282,48 @@ void CreateParallelExecuteFromClusters(tf_device::LaunchOp launch, Block& outside_block = parallel_execute_op.GetRegionBlockWithIndex(cluster.index()); builder.setInsertionPointToEnd(&outside_block); - tf_device::LaunchOp launch_op = - CreateLaunchOpForCluster(&builder, cluster_ops.back()); - MoveClusterOpsToLaunchOp(launch_op, cluster_ops); + tf_device::LaunchOp host_launch_op = + CreateLaunchOpForOutsideCluster(&builder, cluster_ops.back()); + + // Determine if there are any inputs that are provided out of cluster. 
+ auto external_inputs = GetExternalOperands(cluster_ops); + auto external_outputs = GetExternalOutputs(cluster_ops); + + MoveOutsideCompiledOps(tpu_cluster, cluster.value().getFirst(), + host_launch_op, cluster_ops, external_inputs, + external_outputs); + builder.setInsertionPointToEnd(&outside_block); - // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute - // regions either through communication with TPU parallel_execute regions - // or modifying parallel_execute returns. - builder.create(launch.getLoc(), ArrayRef{}); + builder.create(tpu_cluster.getLoc(), + ArrayRef{}); } // Move the launch body to last parallel_execute block. - Block& inside_block = + Block& parallel_execute_tpu_block = parallel_execute_op.GetRegionBlockWithIndex(num_regions - 1); - builder.setInsertionPointToEnd(&inside_block); - builder.create(launch.getLoc(), launch.getResults()); - launch.getOperation()->moveBefore(inside_block.getTerminator()); + builder.setInsertionPointToEnd(¶llel_execute_tpu_block); + builder.create(tpu_cluster.getLoc(), + tpu_cluster.getResults()); + tpu_cluster.getOperation()->moveBefore( + parallel_execute_tpu_block.getTerminator()); PropagateParallelExecuteReturnToReplicate(parallel_execute_op); - // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute - // regions either through communication with TPU parallel_execute regions - // or modifying parallel_execute returns. } void TPUExtractOutsideCompilation::runOnFunction() { - auto extract_result = getFunction().walk([&](tf_device::LaunchOp launch) { - ClusterMap clusters; - if (failed(CollectAndGroupClusterOps(&launch.GetBody(), &clusters))) - return WalkResult::interrupt(); + auto extract_result = + getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { + OutsideClusterMap clusters; + if (failed(CollectAndGroupOutsideClusterOps(&tpu_cluster.GetBody(), + &clusters))) + return WalkResult::interrupt(); - if (clusters.empty()) return WalkResult::advance(); + if (clusters.empty()) return WalkResult::advance(); - CreateParallelExecuteFromClusters(launch, clusters); + CreateParallelExecuteFromOutsideClusters(tpu_cluster, clusters); - return WalkResult::advance(); - }); + return WalkResult::advance(); + }); if (extract_result.wasInterrupted()) return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 98ff0de7645..696882cd105 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -64,19 +64,14 @@ static llvm::cl::opt tpu_compile_metadata_debug( "'tf._TPUCompileMlir' op as a proto debug string")); constexpr char kNumReplicasAttr[] = "num_replicas"; -constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; constexpr char kPaddingMapAttr[] = "padding_map"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; constexpr char kBadStringArrayElementMsg[] = "bad '{0}' attribute at index {1}, not a string"; -constexpr char kBadIntArrayElementMsg[] = - "bad '{0}' attribute at index {1}, not an int"; constexpr char kBadArrayElementMsg[] = "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; constexpr char 
kBadArrayAttrLengthMsg[] = @@ -92,7 +87,7 @@ constexpr char kBadArrayAttrLengthMsg[] = // // Would become following ops (unimportant attributes, types are omitted): // %1 = "tf.Shape"(%0) -// %2:2 = "tf.MLIRCompileToTPU"(%1) {module = ""} +// %2:2 = "tf._TPUCompileMlir"(%1) {module = ""} // "tf.TPUCompileSucceededAssert"(%2#0) // %3 = "tf.TPUExecute"(%0, %2#1) // %4 = "tf.SomeOp"(%3) @@ -163,32 +158,6 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return success(); } -// Extracts device coordinates from a device assignment attribute on an op. -LogicalResult GetDeviceCoordinates( - tf_device::ClusterFuncOp op, - llvm::SmallVectorImpl* device_assignment) { - auto device_assignment_attr = - op.getAttrOfType(kDeviceAssignmentAttr); - if (!device_assignment_attr) - return op.emitOpError(CreateMissingAttributeMsg(kDeviceAssignmentAttr)); - - device_assignment->reserve(device_assignment_attr.size()); - - for (auto device_coordinate_and_idx : - llvm::enumerate(device_assignment_attr)) { - auto device_coordinate = - device_coordinate_and_idx.value().dyn_cast(); - if (!device_coordinate) - return op.emitOpError(llvm::formatv(kBadIntArrayElementMsg, - kDeviceAssignmentAttr, - device_coordinate_and_idx.index())); - - device_assignment->push_back(device_coordinate.getInt()); - } - - return success(); -} - // Populates a TPUCompileMetadataProto with StepMarkerLocation from a // `tf_device::ClusterFuncOp`. LogicalResult SetMetadataProtoStepMarkerLocation( @@ -448,25 +417,38 @@ Operation* BuildCompileOp( // core, and all replica devices per core are grouped together. void AssignDevicesToReplicate( tf_device::ReplicateOp replicate, - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, OpBuilder* builder) { if (!replicate) return; - const int num_replicas = execution_devices.size(); - const int num_cores_per_replica = execution_devices.front().size(); + const int num_replicas = tpu_devices.size(); + const int num_cores_per_replica = tpu_devices.front().size(); llvm::SmallVector device_attrs; for (int core = 0; core < num_cores_per_replica; ++core) { llvm::SmallVector devices_by_core; devices_by_core.reserve(num_replicas); for (int replica = 0; replica < num_replicas; ++replica) - devices_by_core.push_back(execution_devices[replica][core]); + devices_by_core.push_back(tpu_devices[replica][core].device); device_attrs.push_back( builder->getNamedAttr(tensorflow::GetDeviceAliasForLogicalCore(core), builder->getStrArrayAttr(devices_by_core))); } + // For data parallelism, also add replicated host devices, as these are + // necessary for outside compilation. + if (num_cores_per_replica == 1) { + llvm::SmallVector hosts; + hosts.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) + hosts.push_back(tpu_devices[replica][0].host); + + device_attrs.push_back(builder->getNamedAttr( + tensorflow::kTPUReplicatedHost, builder->getStrArrayAttr(hosts))); + } + replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); } @@ -492,11 +474,12 @@ LogicalResult BuildExecuteOp( // Creates a tf_device.parallel_execute op that wraps TPUExecute op to // represent execution of TPU program in multiple logical cores. 
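A worked example of the device grouping in AssignDevicesToReplicate above; the device strings and alias names below are illustrative only. Per logical core, that core's device is collected from every replica, and with a single core per replica the per-replica host devices are also collected for outside compilation.

#include <iostream>
#include <string>
#include <vector>

struct ToyTPUDevice { std::string device, host; };  // stand-in for a device/host pair

int main() {
  // tpu_devices[replica][core]
  std::vector<std::vector<ToyTPUDevice>> tpu_devices = {
      {{"/task:0/device:TPU:0", "/task:0/device:CPU:0"}},
      {{"/task:1/device:TPU:0", "/task:1/device:CPU:0"}},
  };
  const int num_replicas = static_cast<int>(tpu_devices.size());
  const int num_cores_per_replica = static_cast<int>(tpu_devices.front().size());

  for (int core = 0; core < num_cores_per_replica; ++core) {
    std::cout << "core alias " << core << ":";
    for (int replica = 0; replica < num_replicas; ++replica)
      std::cout << " " << tpu_devices[replica][core].device;
    std::cout << "\n";
  }
  if (num_cores_per_replica == 1) {  // data parallelism: also publish replicated hosts
    std::cout << "replicated host alias:";
    for (int replica = 0; replica < num_replicas; ++replica)
      std::cout << " " << tpu_devices[replica][0].host;
    std::cout << "\n";
  }
}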
LogicalResult BuildParallelExecuteOp( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, llvm::ArrayRef output_sharding_config, Operation* compile_op, tf_device::ClusterFuncOp cluster_func, OpBuilder* builder, tf_device::ParallelExecuteOp* parallel_execute_op) { - const int num_cores_per_replica = execution_devices.front().size(); + const int num_cores_per_replica = tpu_devices.front().size(); // parallel_execute op returns concatenated list of return values of // all its regions. // @@ -528,7 +511,7 @@ LogicalResult BuildParallelExecuteOp( num_cores_per_replica, cluster_func, builder, &input_list); if (failed(result)) return failure(); - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // For each logical core, create a region with TPUExecute op. assert(input_list.size() == num_cores_per_replica); for (int core = 0; core < num_cores_per_replica; ++core) { @@ -553,7 +536,7 @@ LogicalResult BuildParallelExecuteOp( // op. std::string device = replicated ? tensorflow::GetDeviceAliasForLogicalCore(core) - : execution_devices.front()[core]; + : tpu_devices.front()[core].device; auto region_launch_op = WrapOpInLaunch(builder, region.getParent()->getLoc(), execute, device); @@ -566,13 +549,14 @@ LogicalResult BuildParallelExecuteOp( } tf_device::LaunchOp AssignDevicesToReplicatedExecute( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, Operation* execute_op, OpBuilder* builder) { - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // If computation is replicated, use aliased device. Otherwise there is only // one execution device and the device is assigned to the execute op. std::string device = replicated ? 
tensorflow::GetDeviceAliasForLogicalCore(0) - : execution_devices.front().front(); + : tpu_devices.front().front().device; return WrapOpInLaunch(builder, execute_op->getLoc(), execute_op, device); } @@ -658,27 +642,41 @@ LogicalResult Rewrite( : nullptr; if (replicate) num_replicas = replicate.n().getLimitedValue(); - auto num_cores_per_replica_attr = - cluster_func.getAttrOfType(kNumCoresPerReplicaAttr); + auto num_cores_per_replica_attr = cluster_func.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); if (!num_cores_per_replica_attr) return cluster_func.emitOpError( - CreateMissingAttributeMsg(kNumCoresPerReplicaAttr)); + CreateMissingAttributeMsg(tensorflow::kNumCoresPerReplicaAttr)); int num_cores_per_replica = num_cores_per_replica_attr.getInt(); - auto topology_attr = cluster_func.getAttrOfType(kTopologyAttr); + auto topology_attr = + cluster_func.getAttrOfType(tensorflow::kTopologyAttr); if (!topology_attr) - return cluster_func.emitOpError(CreateMissingAttributeMsg(kTopologyAttr)); + return cluster_func.emitOpError( + CreateMissingAttributeMsg(tensorflow::kTopologyAttr)); - llvm::SmallVector device_assignment; - if (failed(GetDeviceCoordinates(cluster_func, &device_assignment))) - return failure(); + auto device_assignment_attr = cluster_func.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster_func.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + if (!status_or_device_coodinates.ok()) + return cluster_func.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); // Determine compilation and execution devices. auto status_or_tpu_device_assignment = tensorflow::GetTPUCompilationAndExecutionDevices( devices, num_replicas, num_cores_per_replica, - topology_attr.getValue(), device_assignment); + topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); if (!status_or_tpu_device_assignment.ok()) return cluster_func.emitError() << "error in fetching TPU compilation/execution devices: " @@ -687,12 +685,35 @@ LogicalResult Rewrite( // Create compile op. auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); builder->setInsertionPoint(cluster_func); + + // Create the TPUCompileMlir and TPUCompileSucceededAssert outside of + // parallel_execute region if it exists. + if (llvm::isa(cluster_func.getParentOp())) { + // Currently, outside compilation and model parallelism are not supported + // together. + assert(num_cores_per_replica == 1); + builder->setInsertionPoint(cluster_func.getParentOp()); + } + Operation* compile_op = BuildCompileOp( cluster_func, num_replicas, num_cores_per_replica, tpu_device_assignment.compilation_device, std::move(tpu_device_assignment.xla_device_assignment), builder); if (!compile_op) return failure(); + // This replaces _TPUCompileMlir placeholder ops that are required + // by XlaRecvAtHost and XlaSendFromHost ops add in earlier pass. + // TODO(b/157054714): When a better abstraction instead of _TPUCompileMlirOp + // and _XlaRecvAtHostOp and _XlaSendFromHostOp are used, update to a more + // structured lowering. 
+ if (auto parallel_op = llvm::dyn_cast( + cluster_func.getParentOp())) { + parallel_op.walk([&](TF::_TPUCompileMlirOp parallel_compile_op) { + parallel_compile_op.replaceAllUsesWith(compile_op); + parallel_compile_op.erase(); + }); + } + // After rewrite, find if there is a TPUCompilationResultOp in the block with // the same _tpu_replicate attribute and replace it with the result of the // compile op. This op is used as a placeholder to hook during graph creation @@ -704,7 +725,7 @@ LogicalResult Rewrite( BuildTPUCompileSucceededAssertOp( compile_op, tpu_device_assignment.compilation_device, builder); - AssignDevicesToReplicate(replicate, tpu_device_assignment.execution_devices, + AssignDevicesToReplicate(replicate, tpu_device_assignment.tpu_devices, builder); llvm::SmallVector output_shardings; @@ -712,12 +733,13 @@ LogicalResult Rewrite( num_cores_per_replica, cluster_func, &output_shardings); if (failed(result)) return failure(); + builder->setInsertionPoint(cluster_func); if (num_cores_per_replica > 1) { // For model parallelism, tf_device.parallel_execute is used to express // concurrent device execution across multiple logical devices. tf_device::ParallelExecuteOp execute_op; - result = BuildParallelExecuteOp(tpu_device_assignment.execution_devices, + result = BuildParallelExecuteOp(tpu_device_assignment.tpu_devices, output_shardings, compile_op, cluster_func, builder, &execute_op); if (failed(result)) return failure(); @@ -740,7 +762,7 @@ LogicalResult Rewrite( if (failed(result)) return failure(); tf_device::LaunchOp launch_op = AssignDevicesToReplicatedExecute( - tpu_device_assignment.execution_devices, execute_op, builder); + tpu_device_assignment.tpu_devices, execute_op, builder); cluster_func.replaceAllUsesWith(launch_op); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc new file mode 100644 index 00000000000..7befa68f3d8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -0,0 +1,703 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +constexpr char kDeviceAttr[] = "device"; +typedef std::pair Conv2DWithBlockSize; + +// A pass that applies automatic space to depth transform for the first or +// frontier convolutions consume host inputs on TPU. +// This is done by adding space to depth transform op after host input and +// applying space to depth transform for the first convolution and its backprop +// filter on TPU. +// +// Example: original program: +// +// module { +// func @while_body { +// %input = "tf.IteratorGetNext"(...) {device = "/CPU:0"}: +// -> tensor<2x224x224x3xf32> +// %device_launch = "tf_device.cluster_func"(%input,...) {func = @_func,...) +// return ... +// } +// func @_func(%input: tensor<2x224x224x3xf32>, +// %filter: tensor<7x7x3x64xf32>) { +// %6 = "tf.Conv2D"(%input, %filter) {strides = [1, 2, 2, 1]}: +// (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> +// tensor<2x112x112x64xf32> +// } +// } +// +// With this pass, the program will be transformed into: +// module { +// func @while_body { +// %input = "tf.IteratorGetNext"(...) {device = "/CPU:0"} +// -> tensor<2x224x224x3xf32> +// %space_to_depth = "tf.SpaceToDepth"(%input) {block_size = 2, ...}: +// (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> +// %device_launch = "tf_device.cluster_func"(%space_to_depth,...) +// {func = @_func,...) +// return ... +// } +// func @_func(%input: tensor<2x112x112x12xf32>, +// %filter: tensor<7x7x3x64xf32>) { +// %filter_transform = "tf.Pad/tf.Transpose/tf.Reshape"(%filter): +// tensor<7x7x3x64xf32>) -> tensor<4x4x12x64xf32> +// %conv = "tf.Conv2D"(%input, %filter_transfrom) {strides = [1, 1, 1, 1]}: +// (tensor<2x112x112x12xf32>, tensor<4x4x12x64xf32>) -> +// tensor<2x112x112x64xf32> +// } +// } +// +// This way, the first convolution with 3 feature dimension will be transformed +// to 12 feature dimension, which has better performance on TPU. 
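The shape arithmetic behind the example above, as a runnable check: block_size 2 turns the 2x224x224x3 host input into 2x112x112x12.

#include <iostream>
#include <vector>

// NHWC space-to-depth output shape: spatial dims shrink by block_size, the feature
// dimension grows by block_size * block_size.
std::vector<long> SpaceToDepthShape(const std::vector<long>& in, long block_size) {
  return {in[0], in[1] / block_size, in[2] / block_size,
          in[3] * block_size * block_size};
}

int main() {
  std::vector<long> out = SpaceToDepthShape({2, 224, 224, 3}, /*block_size=*/2);
  std::cout << out[0] << "x" << out[1] << "x" << out[2] << "x" << out[3] << "\n";
  // Prints 2x112x112x12, matching the transformed IteratorGetNext result above.
}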
+// +// TODO(wangtao): add a pass to check if it is profitable to space to depth +// transform and invoke the transform if it is needed. +struct TPUSpaceToDepthPass + : public PassWrapper> { + void runOnOperation() override; +}; + +// Handle padding before convolution for space to depth transform. +LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { + auto ranked_type = op.input().getType().dyn_cast(); + if (!ranked_type) return failure(); + auto pad_input_shape = ranked_type.getShape(); + Location loc = op.getLoc(); + OpBuilder builder(op); + builder.setInsertionPoint(op); + auto padding_type = RankedTensorType::get({4, 2}, builder.getIntegerType(32)); + + // Calculate paddings. + int32_t pad_total = kernel_size - 1; + int32_t pad_beg = (pad_total / 2 + 1) / block_size; + int32_t pad_end = (pad_total / 2) / block_size; + SmallVector values = {0, 0, pad_beg, pad_end, + pad_beg, pad_end, 0, 0}; + auto paddings = DenseIntElementsAttr::get(padding_type, values); + // Update pad_op paddings. + op.setOperand(1, builder.create(loc, paddings)); + + // Set input type. + auto input = op.getOperand(0); + SmallVector transform_shape = { + pad_input_shape[0], pad_input_shape[1] / block_size, + pad_input_shape[2] / block_size, + pad_input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + input.setType(transform_result_type); + op.setOperand(0, input); + return success(); +} + +// Handle stride for the first convolution for the transform. +void HandleConv2DStride(TF::Conv2DOp conv2d) { + MLIRContext* context = conv2d.getContext(); + SmallVector values = {1, 1, 1, 1}; + auto attrs = llvm::map_range(values, [context](int64_t v) -> Attribute { + return IntegerAttr::get(IntegerType::get(64, context), v); + }); + // TODO(b/157276506): change type of strides to DenseElementsAttr + auto strides = ArrayAttr::get(llvm::to_vector<4>(attrs), context); + conv2d.setAttr("strides", strides); +} + +// Transform input shape for the first convolution. +void HandleConv2DInput(TF::Conv2DOp conv2d, int64_t block_size) { + auto input = conv2d.input(); + auto input_shape = input.getType().cast().getShape(); + SmallVector transform_shape = { + input_shape[0], input_shape[1] / block_size, input_shape[2] / block_size, + input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + input.setType(transform_result_type); +} + +// Add padding for convolution filter for space to depth transform. +TF::PadOp GetPadOpForConv2DFilter(ArrayRef filter_shape, Value filter, + OpBuilder* builder, int32_t pad_h, + int32_t pad_w) { + SmallVector values = {pad_h, 0, pad_w, 0, 0, 0, 0, 0}; + auto padding_type = + RankedTensorType::get({4, 2}, builder->getIntegerType(32)); + auto paddings = DenseIntElementsAttr::get(padding_type, values); + auto paddings_value = builder->create(filter.getLoc(), paddings); + std::vector pad_shape = {filter_shape[0] + pad_h, + filter_shape[1] + pad_w, filter_shape[2], + filter_shape[3]}; + SmallVector expand_shape(pad_shape.begin(), pad_shape.end()); + + auto expand_result_type = + RankedTensorType::get(expand_shape, getElementTypeOrSelf(filter)); + return builder->create(filter.getLoc(), expand_result_type, filter, + paddings_value); +} + +// Create reshape op for space to depth transform. 
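A quick numeric check of the HandlePad padding arithmetic above for a 7x7 kernel and block_size 2 (values only; the real pass materializes them as a tf.Const paddings operand).

#include <cstdint>
#include <iostream>

int main() {
  int32_t kernel_size = 7, block_size = 2;
  int32_t pad_total = kernel_size - 1;                 // 6
  int32_t pad_beg = (pad_total / 2 + 1) / block_size;  // (3 + 1) / 2 = 2
  int32_t pad_end = (pad_total / 2) / block_size;      // 3 / 2 = 1
  // Paddings are laid out as [[0,0],[pad_beg,pad_end],[pad_beg,pad_end],[0,0]] in NHWC.
  std::cout << "pad_beg=" << pad_beg << " pad_end=" << pad_end << "\n";
}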
+TF::ReshapeOp GetReshapeOpForConv2DFilter(ArrayRef new_shape, + Value input, OpBuilder* builder) { + auto reshape_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(input)); + auto reshape_type = RankedTensorType::get( + {static_cast(new_shape.size())}, builder->getIntegerType(64)); + auto reshape_sizes = DenseIntElementsAttr::get(reshape_type, new_shape); + auto reshape_value = + builder->create(input.getLoc(), reshape_sizes); + return builder->create(input.getLoc(), reshape_result_type, + input, reshape_value); +} + +// Create transpose op for shape to depth transform. +TF::TransposeOp GetTransposeOpForConv2DFilter(OpBuilder* builder, Value input) { + SmallVector permutation = {0, 2, 1, 3, 4, 5}; + auto permute_type = RankedTensorType::get({6}, builder->getIntegerType(32)); + auto permute_attr = DenseIntElementsAttr::get(permute_type, permutation); + auto permute_value = + builder->create(input.getLoc(), permute_attr); + return builder->create(input.getLoc(), input, permute_value); +} + +void HandleConv2DFilter(TF::Conv2DOp conv2d, int64_t block_size) { + // For example, if filter shape is [7, 7, 3, 64] with block_size 2, + // will apply below transforms to the filter: + // 1. Pad the filter to [8, 8, 3, 64] + // 2. Reshape to [4, 2, 4, 2, 3, 64] + // 3. Transpose to [4, 4, 2, 2, 3, 64] + // 4. Reshape to [4, 4, 12, 64] + auto filter = conv2d.filter(); + OpBuilder builder(conv2d); + builder.setInsertionPoint(conv2d); + // Book keeping filter information. + auto filter_shape = filter.getType().cast().getShape(); + int64_t height = filter_shape[0]; + int64_t width = filter_shape[1]; + int64_t channel = filter_shape[2]; + int64_t out_channel = filter_shape[3]; + // Value/Op before reshape op. + Value before_reshape_value = filter; + if (height % block_size != 0 || width % block_size != 0) { + // Calculate paddings for height and width. + int32_t pad_h = block_size - height % block_size; + int32_t pad_w = block_size - width % block_size; + auto pad_op = + GetPadOpForConv2DFilter(filter_shape, filter, &builder, pad_h, pad_w); + // Update op, height and width before reshape. + before_reshape_value = pad_op; + height = height + pad_h; + width = width + pad_w; + } + + // Reshape. + SmallVector new_shape = { + height / block_size, block_size, width / block_size, + block_size, channel, out_channel}; + auto reshape_op = + GetReshapeOpForConv2DFilter(new_shape, before_reshape_value, &builder); + + // Transpose. + auto transpose_op = GetTransposeOpForConv2DFilter(&builder, reshape_op); + + // Reshape Back. + SmallVector final_shape = { + height / block_size, width / block_size, + channel * block_size * block_size, out_channel}; + auto final_reshape_op = + GetReshapeOpForConv2DFilter(final_shape, transpose_op, &builder); + // Update filter of Conv2D. + conv2d.setOperand(1, final_reshape_op); +} + +// Create slice op for filter in back prop pass. 
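To make the four-step filter rewrite in HandleConv2DFilter easier to follow (the backprop-filter helpers below apply the same steps in reverse and then slice back to the original shape), this shape-only trace reproduces the intermediate shapes for the [7, 7, 3, 64] filter with block_size 2 from the comment; it is an illustration of the bookkeeping only and does not touch the MLIR rewrites themselves.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Shape-only trace of the filter rewrite; the real pass builds tf.Pad,
// tf.Reshape and tf.Transpose ops instead of manipulating vectors.
void TraceFilterRewrite(int64_t h, int64_t w, int64_t in_c, int64_t out_c,
                        int64_t b) {
  auto pad = [b](int64_t d) { return d % b == 0 ? d : d + (b - d % b); };
  int64_t ph = pad(h), pw = pad(w);  // 1. Pad:       [8, 8, 3, 64]
  std::vector<int64_t> reshaped = {ph / b, b, pw / b,
                                   b, in_c, out_c};  // 2. Reshape: [4, 2, 4, 2, 3, 64]
  // 3. Transpose with permutation {0, 2, 1, 3, 4, 5}: [4, 4, 2, 2, 3, 64]
  std::vector<int64_t> transposed = {reshaped[0], reshaped[2], reshaped[1],
                                     reshaped[3], reshaped[4], reshaped[5]};
  // 4. Final reshape:                                 [4, 4, 12, 64]
  std::vector<int64_t> final_shape = {ph / b, pw / b, in_c * b * b, out_c};
  std::cout << final_shape[0] << "x" << final_shape[1] << "x" << final_shape[2]
            << "x" << final_shape[3] << "\n";
}

int main() { TraceFilterRewrite(7, 7, 3, 64, 2); }  // prints 4x4x12x64
```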
+TF::SliceOp GetSliceOpForConv2DBackPropFilter( + ArrayRef old_filter_shape, Value input, OpBuilder* builder) { + SmallVector slice_size(old_filter_shape.begin(), + old_filter_shape.end()); + auto slice_result_type = + RankedTensorType::get(slice_size, getElementTypeOrSelf(input)); + auto slice_size_op = builder->create( + input.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({4}, builder->getIntegerType(32)), + old_filter_shape)); + SmallVector slice_start_position = {0, 0, 0, 0}; + auto start_position_type = + RankedTensorType::get({4}, builder->getIntegerType(64)); + auto start_position = builder->create( + input.getLoc(), + DenseIntElementsAttr::get(start_position_type, slice_start_position)); + return builder->create(input.getLoc(), slice_result_type, input, + start_position, slice_size_op); +} + +// Transform Conv2DBackPropFilter for space to depth. +void HandleConv2DBackPropFilter(TF::Conv2DBackpropFilterOp backprop, + ArrayRef old_filter_shape, + ArrayRef new_filter_shape, + int64_t block_size) { + OpBuilder builder(backprop); + builder.setInsertionPoint(backprop); + + auto input = backprop.input(); + // Get new filter size from new_filter_shape. + auto new_filter_sizes = builder.create( + backprop.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({4}, builder.getIntegerType(32)), + new_filter_shape)); + + // Set stride to [1, 1, 1, 1]. + MLIRContext* context = backprop.getContext(); + SmallVector values = {1, 1, 1, 1}; + auto attrs = llvm::map_range(values, [context](int64_t v) -> Attribute { + return IntegerAttr::get(IntegerType::get(64, context), APInt(64, v)); + }); + auto strides = ArrayAttr::get(llvm::to_vector<4>(attrs), context); + + // new result type. + SmallVector new_shape(new_filter_shape.begin(), + new_filter_shape.end()); + auto new_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(input)); + + // Build new BackPropFilterOp. + auto loc = backprop.getLoc(); + auto new_backprop = builder.create( + loc, new_result_type, input, new_filter_sizes, backprop.out_backprop(), + strides, backprop.use_cudnn_on_gpu(), backprop.padding(), + backprop.explicit_paddings(), backprop.data_format(), + backprop.dilations()); + + // For example, if new filter shape is [4, 4, 12, 64], old filter shape + // is [7, 7, 3, 64] with block_size 2. + // Below transforms will be applied to the filter: + // 1. Reshape to [4, 4, 2, 2, 3, 64]; + // 2. Transpose to [4, 2, 4, 2, 3, 64]; + // 3. Reshape to [8, 8, 3, 64]; + // 4. Slice to [7, 7, 3, 64]. + SmallVector first_reshape_shape = { + new_filter_shape[0], + new_filter_shape[1], + block_size, + block_size, + new_filter_shape[2] / (block_size * block_size), + new_filter_shape[3]}; + auto first_reshape_op = + GetReshapeOpForConv2DFilter(first_reshape_shape, new_backprop, &builder); + + // Transpose. + auto transpose_op = GetTransposeOpForConv2DFilter(&builder, first_reshape_op); + + // Last Reshape op. + SmallVector last_reshape_shape = { + new_filter_shape[0] * block_size, new_filter_shape[1] * block_size, + new_filter_shape[2] / (block_size * block_size), new_filter_shape[3]}; + auto final_reshape_op = + GetReshapeOpForConv2DFilter(last_reshape_shape, transpose_op, &builder); + + // create slice op. + auto slice_op = GetSliceOpForConv2DBackPropFilter(old_filter_shape, + final_reshape_op, &builder); + + // Update backprop's user with the slice op. + backprop.replaceAllUsesWith(slice_op.getResult()); +} + +// Update func arugument type to have the updated input shape. 
+void UpdateFuncType(FuncOp func) { + llvm::SmallVector arg_types; + arg_types.reserve(func.getNumArguments()); + for (auto arg : func.getArguments()) arg_types.emplace_back(arg.getType()); + auto terminator = func.front().getTerminator(); + SmallVector result_types(terminator->operand_type_begin(), + terminator->operand_type_end()); + func.setType(FunctionType::get(arg_types, result_types, func.getContext())); +} + +void HandleFuncOp(Operation* op) { + auto func = llvm::cast(op); + UpdateFuncType(func); +} + +// Checks if the input producer op is supported in this transform. Right now, we +// only check if it is a host tf.IteratorGetNext. +bool IsSupportedHostInputOp(Operation* op) { + TF::IteratorGetNextOp iter = llvm::dyn_cast(op); + if (!iter) return false; + auto device = op->getAttrOfType(kDeviceAttr); + if (!device) return false; + tensorflow::DeviceNameUtils::ParsedName parsed_device; + if (!tensorflow::DeviceNameUtils::ParseFullName(device.getValue().str(), + &parsed_device)) { + return false; + } + return parsed_device.type == "CPU"; +} + +// Builds a SpaceToDepthOp with the given get_layout op and input. +TF::SpaceToDepthOp BuildSpaceToDepth(tf_device::ClusterFuncOp cluster_func, + Value input, int32_t block_size, + ArrayRef input_shape) { + auto input_op = input.getDefiningOp(); + OpBuilder builder(input_op); + builder.setInsertionPointAfter(input_op); + SmallVector transform_shape = { + input_shape[0], input_shape[1] / block_size, input_shape[2] / block_size, + input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + return builder.create(cluster_func.getLoc(), + transform_result_type, input, + APInt(64, block_size)); +} + +// Performs transformation for a non-replicated input. +TF::SpaceToDepthOp HandleHostInput(Value input, int64_t index, + tf_device::ClusterFuncOp cluster_func, + int32_t block_size, + ArrayRef input_shape) { + auto space_to_depth = + BuildSpaceToDepth(cluster_func, input, block_size, input_shape); + cluster_func.setOperand(index, space_to_depth); + return space_to_depth; +} + +// Performs transformation for replicated inputs. Returns true if this is a +// supported case (thus transform happened). +bool HandleHostReplicatedInputs(int64_t index, + tf_device::ClusterFuncOp cluster_func, + int64_t replicate_arg_index, + tf_device::ReplicateOp replicate, + int32_t block_size) { + // We need to know the devices to copy to. + if (!replicate.devices()) return false; + int64_t num_replicas = replicate.n().getZExtValue(); + // Gets inputs at replicate_arg_index for each replica. + auto inputs = replicate.getOperands() + .drop_front(replicate_arg_index * num_replicas) + .take_front(num_replicas); + for (auto input : inputs) { + auto input_op = input.getDefiningOp(); + if (!input_op || !IsSupportedHostInputOp(input_op)) return false; + } + for (auto entry : llvm::enumerate(inputs)) { + auto ranked_type = entry.value().getType().dyn_cast(); + if (!ranked_type) return false; + auto input_shape = ranked_type.getShape(); + auto space_to_depth = + BuildSpaceToDepth(cluster_func, entry.value(), block_size, input_shape); + replicate.setOperand(num_replicas * replicate_arg_index + entry.index(), + space_to_depth); + } + return true; +} + +// Performs transformation on a pair of execute and compile ops. The compile +// should not have other uses. 
+void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size, + unsigned arg_num) { + auto maybe_replicate = + llvm::dyn_cast(cluster_func.getParentOp()); + + llvm::SmallVector transform_input_indices; + for (auto input : llvm::enumerate(cluster_func.operands())) { + if (auto block_arg = input.value().dyn_cast()) { + if (block_arg.getArgNumber() != arg_num) continue; + // For a block argument, consider transforms only when it is a replicated + // input (defining ops will be outside the replicate node). + if (maybe_replicate == block_arg.getParentRegion()->getParentOp()) { + HandleHostReplicatedInputs(input.index(), cluster_func, + block_arg.getArgNumber(), maybe_replicate, + block_size); + } + } else { + // For an op output, consider transforms only when 1) there is no + // replicateion or 2) it is outside the replicate node that encloses the + // execute node. (Because if the op is inside replicate, it is probably + // not on the host.) + if (input.index() != arg_num) continue; + auto input_op = input.value().getDefiningOp(); + if (maybe_replicate && + maybe_replicate.body().isAncestor(input_op->getParentRegion())) { + continue; + } + if (!IsSupportedHostInputOp(input_op)) continue; + auto ranked_type = input.value().getType().dyn_cast(); + if (!ranked_type) continue; + auto input_shape = ranked_type.getShape(); + HandleHostInput(input.value(), input.index(), cluster_func, block_size, + input_shape); + } + } +} + +// Check if input shape of convolution is good for space to depth transform. +bool Conv2DInputShapeCanTransform(Value input) { + auto ranked_type = input.getType().dyn_cast(); + if (!ranked_type) return false; + auto input_shape = ranked_type.getShape(); + int32_t batch_size = input_shape[0]; + int32_t channel = input_shape[3]; + if (batch_size > 8 || channel > 8) { + return false; + } + return true; +} + +// Checks if a convoluton can apply SpaceToDepth transform. +// Only the first convolution in the graph whose batch size smaller than 8 +// and its input feature size smaller than 8 can be transformed. +Optional> GetConv2DInputArgNum(TF::Conv2DOp conv2d) { + if (conv2d.data_format() != "NHWC" || conv2d.strides().size() != 4) { + return None; + } + auto conv2d_input = conv2d.input(); + if (auto block_arg = conv2d_input.dyn_cast()) { + if (!Conv2DInputShapeCanTransform(conv2d_input)) return None; + int num_users = + std::distance(block_arg.getUsers().begin(), block_arg.getUsers().end()); + return std::make_pair(block_arg.getArgNumber(), num_users); + } + + if (auto pad_op = llvm::dyn_cast(conv2d_input.getDefiningOp())) { + auto pad_input = pad_op.input(); + if (auto block_arg = pad_input.dyn_cast()) { + if (!Conv2DInputShapeCanTransform(pad_input)) return None; + int num_users = std::distance(block_arg.getUsers().begin(), + block_arg.getUsers().end()); + return std::make_pair(block_arg.getArgNumber(), num_users); + } + } + + return None; +} + +// Apply space to depth transform for the first convolution on TPU device. +void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) { + // Check if input and filter type are RankedTensorType. + auto input_tensor_type = + conv2d.input().getType().dyn_cast(); + auto filter_tensor_type = + conv2d.filter().getType().dyn_cast(); + if (!input_tensor_type || !filter_tensor_type) return; + // Book keeping filter shape for padding and backprop filter rewrite. + auto filter_shape = filter_tensor_type.getShape(); + SmallVector old_filter_shape(filter_shape.begin(), + filter_shape.end()); + // Handles input. 
+ auto conv2d_input = conv2d.input(); + if (auto block_arg = conv2d_input.dyn_cast()) { + // Change on device function type/shape. + HandleFuncOp(block_arg.getOwner()->getParentOp()); + } + + if (auto pad_op = dyn_cast_or_null(conv2d_input.getDefiningOp())) { + // Rewrite pad_op before Convolutioin. + if (failed(HandlePad(pad_op, filter_shape[0], block_size))) return; + auto pad_input = pad_op.input(); + if (auto block_arg = pad_input.dyn_cast()) { + // Change on device function type/shape. + HandleFuncOp(block_arg.getOwner()->getParentOp()); + } + } + + // Handle Conv2D input, stride and filter. + HandleConv2DInput(conv2d, block_size); + HandleConv2DStride(conv2d); + HandleConv2DFilter(conv2d, block_size); + + // Book keeping new filter shape for backprop filter rewrite. + // Filter shape is defined in HandleConv2DFilter, thus it is RankedTensorType. + filter_shape = conv2d.filter().getType().cast().getShape(); + SmallVector new_filter_shape(filter_shape.begin(), + filter_shape.end()); + + // Rewrite Conv2DBackPropFilter after the first convolution. + for (Operation* user : conv2d.getOperation()->getUsers()) { + if (auto backprop = dyn_cast(user)) { + HandleConv2DBackPropFilter(backprop, old_filter_shape, new_filter_shape, + block_size); + } + } +} + +// Get block size that is equal to stride from spatial dimension +// from convolution. +// Space to depth transform won't be triggered if block size <= 1. +int32_t GetConv2DBlockSize(TF::Conv2DOp conv2d) { + SmallVector strides(4, 1); + for (int i = 0; i < 3; ++i) { + strides[i] = conv2d.strides()[i].cast().getInt(); + } + + // Space to depth only supports striding at spatial dimension. + if (strides[0] != 1 || strides[3] != 1) return 1; + + // Space to depth only supports height_stride == width_stride case. + if (strides[1] != strides[2]) return 1; + + return strides[1]; +} + +void TPUSpaceToDepthPass::runOnOperation() { + Optional cluster_func; + // Space to depth only supports training loop. + auto func_result = getOperation().walk([&](tf_device::ClusterFuncOp cluster) { + cluster_func = cluster; + return WalkResult::interrupt(); + }); + + // Return if there is no tf_device::ClusterFuncOp in training loop. + if (!func_result.wasInterrupted() || !cluster_func.hasValue()) { + return; + } + + // Get the function on device. + auto device_func = + getOperation().lookupSymbol(cluster_func->getFunc()); + if (!device_func) return; + + TF::Conv2DOp first_conv; + Optional> input_shape; + // A map maps block argument id to the convolutions consumes them. + llvm::SmallDenseMap> + argnum_and_convolutions; + // A map maps block argument id to the number of users. + llvm::SmallDenseMap argnum_num_users; + + // Find out the qualified convolutions and its block argument ids. + auto conv2d_result = device_func.walk([&](TF::Conv2DOp conv2d) { + Optional> arg_num_and_num_users = + GetConv2DInputArgNum(conv2d); + if (arg_num_and_num_users.hasValue()) { + // Get block size for the first convolution. + int64_t block_size = GetConv2DBlockSize(conv2d); + auto arg_num = arg_num_and_num_users.getValue().first; + auto num_users = arg_num_and_num_users.getValue().second; + argnum_and_convolutions[arg_num].emplace_back(conv2d, block_size); + argnum_num_users[arg_num] = num_users; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (!conv2d_result.wasInterrupted()) { + return; + } + + // Iterate through block argument and its convolution users. 
Space to depth + // transform will be applied only if all the below conditions are satisfied: + // 1. All the users of the block argument will lead to convolutions; + // 2. block_size of for the space to depth transform for these convolutions + // are the same; + // 3. block_size of for the space to depth transform for these convolutions + // are larger than 1. + for (auto argnum_and_convolution : argnum_and_convolutions) { + auto arg_num = argnum_and_convolution.getFirst(); + auto conv2d_and_block_sizes = argnum_and_convolution.getSecond(); + // Continue if number of users of the block argment doesn't equal to number + // of transformable convolutions and there is no qualified convolution + // for transform or block size is smaller than 2. + if (argnum_num_users[arg_num] != conv2d_and_block_sizes.size() || + conv2d_and_block_sizes.empty()) { + argnum_and_convolutions.erase(arg_num); + continue; + } + int64_t block_size = conv2d_and_block_sizes[0].second; + if (block_size < 2) { + argnum_and_convolutions.erase(arg_num); + continue; + } + // Continue if not all the block sizes for space to depth transform are the + // same. + for (auto conv2d_and_block_size : conv2d_and_block_sizes) { + if (conv2d_and_block_size.second != block_size) { + argnum_and_convolutions.erase(arg_num); + break; + } + } + } + + // If there is no qualified space to depth transform. + if (argnum_and_convolutions.empty()) { + return; + } + + // Apply space to depth transform. + for (auto argnum_and_convolution : argnum_and_convolutions) { + auto conv2d_and_block_sizes = argnum_and_convolution.getSecond(); + int64_t block_size = conv2d_and_block_sizes[0].second; + // Apply space to depth transform to the input on the host. + HandleCluster(cluster_func.getValue(), block_size, + argnum_and_convolution.getFirst()); + // Transform the convolution. + for (auto conv2d_and_block_size : conv2d_and_block_sizes) { + HandleFirstConvolution(conv2d_and_block_size.first, + conv2d_and_block_size.second); + } + } +} + +} // namespace + +std::unique_ptr> CreateTPUSpaceToDepthPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-space-to-depth-pass", + "Adds ops that allow TPU program enable automaic space to depth for the" + "convolution determined at JIT compile time."); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 9e8745918e3..ec4a25c6fdd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -229,7 +229,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( mapping.emplace_back(it->second, std::move(while_args)); } // Sort the mapping according to execute operand order. - llvm::sort(mapping); + llvm::sort(mapping, llvm::less_first()); // Populate the `retval_index_for_sharding` field of the argument metadate. 
for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) { int64_t arg_index = entry.value().cast().getInt(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 75fcede8fbb..8e51f8c9a25 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -70,7 +70,6 @@ using llvm::isa; using mlir::BlockArgument; using mlir::Dialect; using mlir::Operation; -using mlir::OperationState; using mlir::Value; using stream_executor::port::StatusOr; @@ -79,6 +78,9 @@ namespace { constexpr char kInvalidExecutorGraphMsg[] = "Functions must be of a single Graph with single op Islands: "; +constexpr char kDeviceAttr[] = "tf.device"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + bool IsLegalChar(char c, bool first_char) { if (isalpha(c)) return true; if (isdigit(c)) return true; @@ -267,17 +269,14 @@ StatusOr> Exporter::GetArgumentNode( (*node_def->mutable_attr())["index"] = index_attr; if (auto device_attr = - func.getArgAttrOfType(index, "tf.device")) { + func.getArgAttrOfType(index, kDeviceAttr)) *node_def->mutable_device() = device_attr.getValue().str(); - } - if (auto resource_arg_unique_id_attr = - func.getArgAttrOfType( - index, "tf.resource_arg_unique_id")) { - AttrValue unique_id_attr; - unique_id_attr.set_i(resource_arg_unique_id_attr.getInt()); - (*node_def->mutable_attr())["_resource_arg_unique_id"] = unique_id_attr; - } + llvm::ArrayRef func_arg_i_attrs = + func.getArgAttrs(index); + absl::flat_hash_set attrs_to_ignore = {kDeviceAttr}; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + node_def->mutable_attr())); return node_def; } @@ -682,14 +681,6 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, if (auto attr = function.getAttrOfType(stateful_string)) { func_def.mutable_signature()->set_is_stateful(true); } - for (int64 i = 0; i < function.getNumArguments(); ++i) { - if (auto resource_arg_unique_id_attr = - function.getArgAttrOfType( - i, "tf.resource_arg_unique_id")) { - (*func_def.mutable_resource_arg_unique_id())[i] = - resource_arg_unique_id_attr.getInt(); - } - } // Ignore the gradient and is_stateful attribute on the function as they have // been handled above. 
@@ -699,7 +690,28 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, function.getDialectAttrs()); TF_RETURN_IF_ERROR( ConvertAttributes(funcAttrs, attrs_to_ignore, func_def.mutable_attr())); - (*flib->add_function()) = func_def; + + for (int i = 0, e = function.getNumArguments(); i < e; ++i) { + if (auto resource_arg_unique_id_attr = + function.getArgAttrOfType( + i, kResourceArgUniqueIdAttr)) { + (*func_def.mutable_resource_arg_unique_id())[i] = + resource_arg_unique_id_attr.getInt(); + } + + llvm::ArrayRef func_arg_i_attrs = + function.getArgAttrs(i); + if (func_arg_i_attrs.empty()) continue; + absl::flat_hash_set attrs_to_ignore = { + kDeviceAttr, kResourceArgUniqueIdAttr}; + FunctionDef::ArgAttrs func_def_arg_i_attrs; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + func_def_arg_i_attrs.mutable_attr())); + if (func_def_arg_i_attrs.attr().empty()) continue; + (*func_def.mutable_arg_attr())[i] = std::move(func_def_arg_i_attrs); + } + + (*flib->add_function()) = std::move(func_def); return Status::OK(); } @@ -782,4 +794,22 @@ StatusOr> ConvertMlirToGraphdef( return graphdef; } +stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( + mlir::FuncOp func, const GraphExportConfig& configs, + FunctionDef* function_def) { + Dialect* tf_dialect = func.getContext()->getRegisteredDialect("tf"); + FunctionDefLibrary flib; + TF_RETURN_IF_ERROR( + Exporter::ConvertLibFunction(configs, tf_dialect, func, &flib)); + for (auto& func_def : flib.function()) { + if (func_def.signature().name() == func.getName()) { + *function_def = func_def; + return Status::OK(); + } + } + return errors::InvalidArgument( + "Function couldn't be found in the FunctionDefLibrary after converting " + "from MLIR"); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h index 2d522f6031e..a5aebd16146 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h @@ -18,6 +18,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project @@ -50,6 +51,12 @@ stream_executor::port::Status ConvertMlirToGraph( stream_executor::port::Status ConvertMlirToGraph( mlir::ModuleOp module, const GraphExportConfig& configs, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def); + +// Converts an MLIR function and adds it to a FunctionLibraryDefinition. +stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( + mlir::FuncOp func, const GraphExportConfig& configs, + FunctionDef* function_def); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_EXPORT_GRAPHDEF_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 49be3da912a..3aa700d3718 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -40,7 +40,9 @@ limitations under the License. 
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -57,6 +59,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" @@ -65,6 +68,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -109,6 +113,7 @@ static inline absl::string_view StringRefToView(llvm::StringRef ref) { } namespace tensorflow { +using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; @@ -128,6 +133,13 @@ bool IsOutputShapesAttribute(const AttrValue& attr_value, attr_value.value_case() == AttrValue::kList; } +bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, + llvm::StringRef attr_name) { + if (attr_name == "_handle_dtypes" || attr_name == "_handle_shapes") + return attr_value.value_case() == AttrValue::kList; + return false; +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -191,15 +203,11 @@ class ImporterBase { StatusOr InferLibFunctionType(const FunctionBody& fbody); // Extracts arg and ret nodes from FunctionBody. - // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. void GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* control_ret_nodes); // Prepares converting the graph to an MLIR module. This step removes the // backedges of the graph, orders the nodes and infers the shapes. @@ -213,8 +221,7 @@ class ImporterBase { const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids); + bool function_graph); // Finds out the function definition for the given function name from the // graph and converts it to a function of the module. This method is called @@ -306,9 +313,9 @@ class ImporterBase { // AttrValue {name : foo, attrs : {k1 : bar, k2 : rfc}}, it will convert it to // a list of MLIR Attributes: [{base_name : foo}, {base_name.k1 : bar}, // {base_name.k2 : rfc}}. 
- Status ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes); + Status ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes); // Helper to create either a tf_executor operation or a TF operation wrapped // in an island. When convert_to_legacy_call is true, converts the operation @@ -974,7 +981,6 @@ StatusOr ImporterBase::InferOutputType(const Node& node, int idx, if (dtype == DT_RESOURCE) { const AttrValue* dtype_attr = node.attrs().Find("_handle_dtypes"); const AttrValue* shape_attr = node.attrs().Find("_handle_shapes"); - LOG(INFO) << dtype_attr << " " << shape_attr; if (dtype_attr && shape_attr) { if (dtype_attr->list().type().empty()) { return errors::InvalidArgument( @@ -1089,9 +1095,9 @@ StatusOr ImporterBase::ConvertSubtypes( return subtypes; } -Status ImporterBase::ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes) { +Status ImporterBase::ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes) { TF_ASSIGN_OR_RETURN(auto func_attr, ConvertFunctionCallName(value.func().name())); attributes->push_back(builder_.getNamedAttr(base_name, func_attr)); @@ -1165,8 +1171,18 @@ StatusOr ImporterBase::ConvertAttributeValue( return builder_.getArrayAttr( llvm::makeArrayRef(attrs.begin(), attrs.end())); } - case AttrValue::kFunc: - return errors::Unknown("kFunc type should be handled separately!"); + case AttrValue::kFunc: { + // TODO(b/156546237): Unify kFunc/NameAttrList attribute representation. + // Currently kFunc/NameAttrList attributes in a kList/repeated AttrValue + // will not use this representation. + NamedAttrList attrs; + for (const auto& func_attr : value.func().attr()) { + TF_ASSIGN_OR_RETURN(auto attr, ConvertAttributeValue(func_attr.second)); + attrs.push_back(builder_.getNamedAttr(func_attr.first, attr)); + } + auto func_attrs = builder_.getDictionaryAttr(attrs); + return mlir::TF::FuncAttr::get(context_, value.func().name(), func_attrs); + } case AttrValue::VALUE_NOT_SET: return builder_.getUnitAttr(); // kPlaceholder is not implemented. 
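The new kFunc handling above packs a function attribute's nested attrs into a dictionary-backed FuncAttr, whereas the older flattening convention documented for ConvertFunctionCallAttribute expands {name: foo, attrs: {k1: bar, k2: rfc}} into base_name, base_name.k1, base_name.k2 entries. Below is a minimal sketch of that naming scheme using plain containers; the struct and map here are hypothetical stand-ins, not the MLIR attribute types.

```c++
#include <iostream>
#include <map>
#include <string>

// Hypothetical model of an AttrValue.func payload: a callee name plus its
// nested attributes.
struct FuncAttrValue {
  std::string name;
  std::map<std::string, std::string> attrs;
};

// Flatten into named attributes following the documented convention:
// {base_name: name, base_name.k1: v1, ...}.
std::map<std::string, std::string> Flatten(const std::string& base_name,
                                           const FuncAttrValue& value) {
  std::map<std::string, std::string> out;
  out[base_name] = value.name;
  for (const auto& kv : value.attrs)
    out[base_name + "." + kv.first] = kv.second;
  return out;
}

int main() {
  FuncAttrValue f{"foo", {{"k1", "bar"}, {"k2", "rfc"}}};
  for (const auto& kv : Flatten("body", f))
    std::cout << kv.first << " = " << kv.second << "\n";
  // body = foo, body.k1 = bar, body.k2 = rfc
}
```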
@@ -1179,9 +1195,7 @@ StatusOr ImporterBase::ConvertAttributeValue( void ImporterBase::GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* control_ret_nodes) { arg_nodes->reserve(fbody.arg_nodes.size()); ret_nodes->reserve(fbody.ret_nodes.size()); for (auto arg : fbody.arg_nodes) { @@ -1190,9 +1204,6 @@ void ImporterBase::GetArgsAndRetsFromFunctionBody( for (auto ret : fbody.ret_nodes) { ret_nodes->emplace_back(ret, 0); } - for (const auto& entry : fbody.fdef.resource_arg_unique_id()) { - resource_arg_unique_ids->push_back(entry); - } *control_ret_nodes = fbody.control_ret_nodes; } @@ -1287,14 +1298,13 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; GetArgsAndRetsFromFunctionBody(*fbody, &arg_nodes, &ret_nodes, - &control_ret_nodes, &resource_arg_unique_ids); + &control_ret_nodes); TF_RETURN_IF_ERROR(child_importer.Convert( mlir_func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, llvm::makeArrayRef(attributes.begin(), attributes.end()), - resource_arg_unique_ids)); + /*function_graph=*/true)); return Status::OK(); } @@ -1396,9 +1406,7 @@ Status ImporterBase::Convert( const absl::InlinedVector& arg_nodes, const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, - llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids) { + llvm::ArrayRef attrs, bool function_graph) { // TODO(b/122040776): Uses debug info for FunctionDef. auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), func_name, func_type, attrs); @@ -1424,10 +1432,6 @@ Status ImporterBase::Convert( TF_RETURN_IF_ERROR(ConvertFunctionArgAndRets(function, graph, func_type.getInputs(), arg_nodes, ret_nodes, control_ret_nodes)); - for (const auto& entry : resource_arg_unique_ids) { - function.setArgAttr(entry.first, "tf.resource_arg_unique_id", - builder_.getI64IntegerAttr(entry.second)); - } // TODO(jpienaar): Update post removing shape_refinier_. if (!specs_.enable_shape_inference) { @@ -1486,6 +1490,22 @@ Status ImporterBase::ConvertFunctionArgAndRets( i, "tf.device", builder_.getStringAttr(arg_node.node->requested_device())); + if (arg_node.node->IsArg()) { + for (const auto& arg_node_attr : arg_node.node->attrs()) { + const auto& key = arg_node_attr.first; + // Only import attributes starting with an underscore. + if (key.empty() || key[0] != '_') continue; + // Ignore shape inference attributes as shape information is already + // populated in the result type. + if (IsOutputShapesAttribute(arg_node_attr.second, key) || + IsResourceOutputShapesAttribute(arg_node_attr.second, key)) + continue; + TF_ASSIGN_OR_RETURN(auto converted_attr, + ConvertAttributeValue(arg_node_attr.second)); + func.setArgAttr(i, llvm::formatv("tf.{0}", key).str(), converted_attr); + } + } + island->dropAllReferences(); island->erase(); } @@ -2095,14 +2115,10 @@ class GraphDefImporter : public ImporterBase { // output nodes, for function graphs. Arguments and return values are // determined by node op type. Type and shape information of the function are // inferred by the shape refiner in ImporterBase. 
- // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. StatusOr GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* ret_nodes); // Finds the graph's target nodes/function's control ret nodes based on // supplied node names in `control_outputs`. If `control_outputs` are not @@ -2130,7 +2146,6 @@ StatusOr GraphDefImporter::Convert( absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; llvm::SmallVector attrs; if (specs.graph_as_function) { if (specs.prune_unused_nodes || !specs.inputs.empty() || @@ -2139,10 +2154,9 @@ StatusOr GraphDefImporter::Convert( "Pruning of graph is currently unsupported when the main graph is " "converted to a function."); - TF_ASSIGN_OR_RETURN( - func_type, - importer.GetArgsRetsAndTypesFromFunctionGraph( - context, &arg_nodes, &ret_nodes, &resource_arg_unique_ids)); + TF_ASSIGN_OR_RETURN(func_type, + importer.GetArgsRetsAndTypesFromFunctionGraph( + context, &arg_nodes, &ret_nodes)); TF_RETURN_IF_ERROR(importer.GetControlRetsFromGraph(specs.control_outputs, &control_ret_nodes)); @@ -2210,7 +2224,7 @@ StatusOr GraphDefImporter::Convert( TF_RETURN_IF_ERROR(importer.ImporterBase::Convert( func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, attrs, - resource_arg_unique_ids)); + specs.graph_as_function)); return module; } @@ -2327,9 +2341,7 @@ StatusOr GraphDefImporter::InferMainFunctionType( StatusOr GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* ret_nodes) { auto add_node = [](Node* node, absl::InlinedVector* nodes) { auto* attr = node->attrs().Find("index"); if (!attr) @@ -2370,12 +2382,6 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*arg_node.node, /*idx=*/0, builder)); arg_types.push_back(type); - tensorflow::int64 resource_arg_unique_id; - if (TryGetNodeAttr(arg_node.node->attrs(), "_resource_arg_unique_id", - &resource_arg_unique_id)) { - resource_arg_unique_ids->emplace_back(arg_node_and_idx.index(), - resource_arg_unique_id); - } } llvm::SmallVector ret_types; @@ -2428,8 +2434,8 @@ class SavedModelObjectGraphImporter : public ImporterBase { // Main entry point: converts all functions in the given meta graph to an MLIR // Module. static StatusOr Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes); + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes); private: explicit SavedModelObjectGraphImporter( @@ -3129,8 +3135,8 @@ Status CreateSavedModelIR( } StatusOr SavedModelObjectGraphImporter::Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes) { + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes) { GraphDebugInfo dummy_debug_info; const GraphDebugInfo& debug_info = saved_model->debug_info() ? 
*saved_model->debug_info() : dummy_debug_info; @@ -3207,17 +3213,20 @@ class SavedModelSignatureDefImporter { public: // Main entry point: converts all functions (specified by SignatureDefs) in // the given meta graph to an MLIR Module. - static StatusOr Convert(const SavedModelBundle& bundle, - mlir::MLIRContext* context) { - SavedModelSignatureDefImporter importer(bundle, context); + static StatusOr Convert( + const SavedModelBundle& bundle, absl::Span exported_names, + mlir::MLIRContext* context) { + SavedModelSignatureDefImporter importer(bundle, exported_names, context); return importer.ConvertSignatures(); } private: SavedModelSignatureDefImporter(const SavedModelBundle& bundle, + absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + exported_names_(exported_names), module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function @@ -3250,6 +3259,7 @@ class SavedModelSignatureDefImporter { const std::vector>& inputs); const SavedModelBundle& bundle_; + absl::Span exported_names_; mlir::OwningModuleRef module_; }; @@ -3265,6 +3275,9 @@ SavedModelSignatureDefImporter::ConvertSignatures() { GraphDebugInfo debug_info; if (bundle_.debug_info != nullptr) debug_info = *bundle_.debug_info; + llvm::StringSet<> exported_name_set; + exported_name_set.insert(exported_names_.begin(), exported_names_.end()); + for (const auto& key_and_signature_def : signatures) { const std::string& sig_def_key = key_and_signature_def.first; const SignatureDef& signature_def = key_and_signature_def.second; @@ -3274,6 +3287,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { if (sig_def_key == "__saved_model_init_op") { continue; } + if (!exported_name_set.empty() && + exported_name_set.count(sig_def_key) == 0) { + continue; + } TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, debug_info, flib_def)); @@ -3317,6 +3334,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature( graphdef, &sub_graph_def, /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); + // Set the function library definitions in the pruned graphdef. + *sub_graph_def.mutable_library() = flib_def.ToProto(); + // Convert sub-graphdef to sub-graph. 
GraphConstructorOptions options; options.allow_internal_ops = true; @@ -3556,12 +3576,14 @@ StatusOr ConvertSavedModelToMlir( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, bool add_default_attributes) { return SavedModelObjectGraphImporter::Convert( - saved_model, context, exported_names, add_default_attributes); + saved_model, exported_names, context, add_default_attributes); } StatusOr ConvertSavedModelV1ToMlir( - const SavedModelBundle& saved_model, mlir::MLIRContext* context) { - return SavedModelSignatureDefImporter::Convert(saved_model, context); + const SavedModelBundle& saved_model, absl::Span exported_names, + mlir::MLIRContext* context) { + return SavedModelSignatureDefImporter::Convert(saved_model, exported_names, + context); } std::string MlirModuleToString(mlir::ModuleOp module, bool show_debug_info) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 8603eadb487..bdb72345201 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -55,6 +55,7 @@ stream_executor::port::StatusOr ConvertSavedModelToMlir( // expressed with tf_executor dialect. stream_executor::port::StatusOr ConvertSavedModelV1ToMlir(const SavedModelBundle& saved_model, + absl::Span exported_names, mlir::MLIRContext* context); // Serialize a MLIR module to a string. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 2c7f84d8268..6ada0fec4e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -141,7 +141,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context) { + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context) { tensorflow::SavedModelBundle bundle; tensorflow::SessionOptions session_options; // Force saved model states to be restored to CPU. @@ -155,7 +156,7 @@ mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( return nullptr; } - auto module_or = ConvertSavedModelV1ToMlir(bundle, context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, exported_names, context); if (!module_or.status().ok()) { LOG(ERROR) << "SavedModel V1 import failed: " << module_or.status(); return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index f498864c8aa..490b7c7d8f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -64,7 +64,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( // given MLIR `context`. 
mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context); + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 06805e633e2..d7b511094d3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -38,6 +38,7 @@ inline static void Log(BridgeLoggerConfig::PrintCallbackFn print_callback, std::unique_ptr os; std::string filepath; if (CreateFileForDumping(name, &os, &filepath).ok()) print_callback(*os); + VLOG(1) << "Dumped MLIR module to " << filepath; } void BridgeLoggerConfig::printBeforeIfEnabled(mlir::Pass* pass, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 2374687c920..fd1ba3b1901 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -247,9 +248,10 @@ Status RefineShapes(llvm::ArrayRef arg_shapes, static void RegisterDialects() { static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); mlir::registerDialect(); return true; }(); @@ -293,12 +295,22 @@ Status ConvertMLIRToXlaComputation( tf2xla.addPass(mlir::xla_hlo::createLegalizeTfWithTf2XlaPass(device_type)); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); + // Run shape inference pass to propagate shapes through tensor_cast operations + // from static to dynamic shapes. This could be generated if the shape + // inference was originally missing in a TF op but the corresponding HLO op + // had static shape after lowering. + tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Run LegalizeTFPass again because the previous legalization passes can // expose more graph pruning and canonicalization opportunities that are // necessary for the second LegalizeTFPass(allow_partial_conversion=false) // invocation. tf2xla.addNestedPass( mlir::xla_hlo::createLegalizeTFPass(false)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. + tf2xla.addNestedPass( + mlir::xla_hlo::createSinkConstantsToControlFlowPass()); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index fcfef565952..b28f26b6c3c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -31,12 +31,14 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -131,13 +133,21 @@ StatusOr ConvertTensor(const Tensor& input_tensor, case DTYPE: \ return ConvertFlatTensor(input_tensor, type); - // TODO(fengliuai): customize the conversions for more types. + // TODO(fengliuai): customize the conversions for quantized and string types. switch (input_dtype) { CONVERT_FLAT(DT_BOOL, bool) CONVERT_FLAT(DT_FLOAT, float) CONVERT_FLAT(DT_DOUBLE, double) + CONVERT_FLAT(DT_INT8, int8) + CONVERT_FLAT(DT_INT16, int16) CONVERT_FLAT(DT_INT32, int32) CONVERT_FLAT(DT_INT64, int64) + CONVERT_FLAT(DT_UINT8, uint8) + CONVERT_FLAT(DT_UINT16, uint16) + CONVERT_FLAT(DT_UINT32, uint32) + CONVERT_FLAT(DT_UINT64, uint64) + CONVERT_FLAT(DT_COMPLEX64, std::complex) + CONVERT_FLAT(DT_COMPLEX128, std::complex) // BFLOAT16 is a special case that it needs to be cast to double type to // match its storage type. @@ -207,12 +217,20 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { // Converts an MLIR dense string elements attribute to a TensorFlow tensor // proto. -Status ConvertStringElementsAttr(const DenseStringElementsAttr attr, - TensorProto* output_tensor) { - for (const auto& val : attr.getRawStringData()) { - output_tensor->add_string_val(val.data(), val.size()); +void ConvertStringElementsAttr( + const DenseStringElementsAttr attr, + protobuf::RepeatedPtrField* output) { + for (const auto& val : attr.getRawStringData()) + output->Add({val.data(), val.size()}); +} + +template +void ConvertComplexElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + for (const auto& val : attr.getValues>()) { + output->Add(val.real()); + output->Add(val.imag()); } - return Status::OK(); } // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. @@ -226,139 +244,80 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the double_val field updated. -Status ConvertDoubleElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_double_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_double_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the float_val field updated. 
-Status ConvertFloatElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_float_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_float_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the half_val field updated. -Status ConvertHalfElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_half_val( - (*elts.begin()).bitcastToAPInt().getSExtValue()); - } else { - for (const auto& value : elts.getFloatValues()) - output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int_val field updated. -Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertBfloat16ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - auto elts = attr.dyn_cast(); - if (!elts) { - return ConvertOpaqueElementsAttr(attr, output_tensor); - } - - // Bfloat16 is internally represented as `double` in MLIR. - if (elts.isSplat()) { - double v = elts.getSplatValue(); - bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); +// Converts an MLIR elements attribute and adds it to specified repeated field. +template +void ConvertElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add(attr.getSplatValue()); } else { - for (auto v : elts.getValues()) { + for (auto value : attr.getValues()) output->Add(value); + } +} + +// Converts an MLIR elements attribute containing half values and adds it to +// specified repeated field. +void ConvertHalfElementsAttr(const DenseFPElementsAttr attr, + protobuf::RepeatedField* output_tensor) { + if (attr.isSplat()) { + output_tensor->Add((*attr.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (const llvm::APFloat value : attr.getFloatValues()) + output_tensor->Add(value.bitcastToAPInt().getSExtValue()); + } +} + +// Converts an MLIR elements attribute containing int values and adds it to +// specified repeated field. +void ConvertIntElementsAttr(const mlir::DenseIntElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add((*attr.begin()).getSExtValue()); + } else { + for (const llvm::APInt val : attr) output->Add(val.getSExtValue()); + } +} + +void ConvertBfloat16ElementsAttr(const mlir::DenseFPElementsAttr attr, + protobuf::RepeatedField* output) { + // Bfloat16 is internally represented as `double` in MLIR. 
+ if (attr.isSplat()) { + double v = attr.getSplatValue(); + bfloat16 bf16_val = static_cast(v); + output->Add(absl::bit_cast(bf16_val)); + } else { + for (auto v : attr.getValues()) { bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); + output->Add(absl::bit_cast(bf16_val)); } } - - return Status::OK(); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int64_val field updated. -Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int64_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int64_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with bool_val field updated. -Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - for (const auto& val : elts) { - output_tensor->add_bool_val(val.getBoolValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertToTensorProto(const ElementsAttr attr, - TensorProto* output_tensor) { +Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { auto type = attr.getType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); - output_tensor->set_dtype(output_dtype); - ConvertToTensorShapeProto(shape, output_tensor->mutable_tensor_shape()); + output->set_dtype(output_dtype); + ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); + + if (attr.isa()) + return ConvertOpaqueElementsAttr(attr.cast(), output); + + auto dense_attr = attr.dyn_cast(); + if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); switch (output_dtype) { case DT_FLOAT: - return ConvertFloatElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_float_val()); + break; case DT_HALF: - // Handles both DenseFPElementsAttr and OpaqueElementsAttr. 
- return ConvertHalfElementsAttr(attr, output_tensor); + ConvertHalfElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_DOUBLE: - return ConvertDoubleElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_double_val()); + break; case DT_QUINT8: case DT_UINT8: case DT_INT8: @@ -366,20 +325,40 @@ Status ConvertToTensorProto(const ElementsAttr attr, case DT_UINT16: case DT_INT16: case DT_INT32: - return ConvertIntElementsAttr(attr, output_tensor); + ConvertIntElementsAttr(dense_attr.cast(), + output->mutable_int_val()); + break; + case DT_UINT32: + ConvertElementsAttr(dense_attr, output->mutable_uint32_val()); + break; + case DT_UINT64: + ConvertElementsAttr(dense_attr, output->mutable_uint64_val()); + break; case DT_INT64: - return ConvertInt64ElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_int64_val()); + break; case DT_BOOL: - return ConvertBoolElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_bool_val()); + break; case DT_BFLOAT16: - return ConvertBfloat16ElementsAttr(attr, output_tensor); + ConvertBfloat16ElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_STRING: - return ConvertStringElementsAttr(attr.cast(), - output_tensor); + ConvertStringElementsAttr(dense_attr.cast(), + output->mutable_string_val()); + break; + case DT_COMPLEX64: + ConvertComplexElementsAttr(dense_attr, output->mutable_scomplex_val()); + break; + case DT_COMPLEX128: + ConvertComplexElementsAttr(dense_attr, output->mutable_dcomplex_val()); + break; default: - return ConvertOpaqueElementsAttr(attr.cast(), - output_tensor); + return errors::Unimplemented(absl::StrCat("Unimplemented data type ", + DataTypeString(output_dtype))); } + return Status::OK(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index d711c19baae..bf96e3d1df4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include +#include #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -99,48 +100,74 @@ TEST(ConvertTypeToTensorTypeTest, ConvertStringTensor) { EXPECT_EQ(string_values[3], mlir::StringRef("four")); } -TEST(ConvertTypeToTensorTypeTest, Convert16BitFloats) { +class ConvertTensorTest : public ::testing::Test { + protected: + template + void VerifyConversion(std::initializer_list values, DataType dtype, + mlir::Type expected_ty) { + mlir::Builder b(expected_ty.getContext()); + Tensor tensor(dtype, TensorShape({static_cast(values.size())})); + tensor.flat().setValues(values); + + auto value_or = ConvertTensor(tensor, &b); + TF_ASSERT_OK(value_or.status()); + auto attr = value_or.ValueOrDie(); + + EXPECT_EQ(attr.getType().getElementType(), expected_ty); + + Tensor out; + TF_ASSERT_OK(ConvertToTensor(attr, &out)); + + test::ExpectTensorEqual(tensor, out); + } +}; + +TEST_F(ConvertTensorTest, Simple) { RegisterDialects(); + mlir::MLIRContext context; - mlir::Builder b(&context); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {Eigen::half(1.0)}, DT_HALF, mlir::FloatType::getF16(&context))); + ASSERT_NO_FATAL_FAILURE( + VerifyConversion({bfloat16(1.0), bfloat16(-1.0)}, DT_BFLOAT16, + mlir::FloatType::getBF16(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_FLOAT, mlir::FloatType::getF32(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_DOUBLE, mlir::FloatType::getF64(&context))); - { - // Create the sample tensor to convert. - Tensor tensor(DT_HALF, TensorShape({1})); - auto Tt = tensor.flat(); - Tt.setValues({Eigen::half(1.0)}); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT8, mlir::IntegerType::get(8, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT16, mlir::IntegerType::get(16, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT32, mlir::IntegerType::get(32, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT64, mlir::IntegerType::get(64, &context))); - auto value_or = ConvertTensor(tensor, &b); - TF_EXPECT_OK(value_or.status()); - auto attr = value_or.ValueOrDie(); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT8, + mlir::IntegerType::get( + 8, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT16, + mlir::IntegerType::get( + 16, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT32, + mlir::IntegerType::get( + 32, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT64, + mlir::IntegerType::get( + 64, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); - EXPECT_TRUE(attr.isa()); - EXPECT_TRUE(attr.getType().getElementType().isF16()); - - Tensor out; - TF_ASSERT_OK(ConvertToTensor(attr, &out)); - - test::ExpectTensorEqual(tensor, out); - } - - { - // Create the sample tensor to convert. 
- Tensor tensor(DT_BFLOAT16, TensorShape({2})); - auto Tt = tensor.flat(); - Tt.setValues({bfloat16(1.0), bfloat16(-1.0)}); - - auto value_or = ConvertTensor(tensor, &b); - TF_EXPECT_OK(value_or.status()); - auto attr = value_or.ValueOrDie(); - - EXPECT_TRUE(attr.isa()); - EXPECT_TRUE(attr.getType().getElementType().isBF16()); - - Tensor out; - TF_ASSERT_OK(ConvertToTensor(attr, &out)); - - test::ExpectTensorEqual(tensor, out); - } + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX64, + mlir::ComplexType::get(mlir::FloatType::getF32(&context)))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX128, + mlir::ComplexType::get(mlir::FloatType::getF64(&context)))); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index cc795259893..4877cbc4a44 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -59,6 +59,18 @@ limitations under the License. namespace tensorflow { namespace { +// static TensorFlow op prefix set. +std::set* GlobalOpPrefixes() { + static std::set* global_op_prefixes = [] { + std::set* result = new std::set; + result->insert("tf."); + result->insert("_tf."); + result->insert("tf_executor."); + return result; + }(); + return global_op_prefixes; +} + // Converts a location to the debug information for the node def. Status ConvertLocation(mlir::Location inst_loc, NodeDef::ExperimentalDebugInfo* debug_info) { @@ -268,8 +280,10 @@ StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { // - ".sink" or ".Sink": only the NextIteration operation has this suffix. We // don't need to consider ".source"/".Source" because the nodes with this // suffix are skipped by the caller and will not be added to the graph. - if (!op_name.consume_front("_tf.") && !op_name.consume_front("tf.") && - !op_name.consume_front("tf_executor.")) { + auto prefixes = GlobalOpPrefixes(); + if (std::none_of(prefixes->begin(), prefixes->end(), [&](std::string prefix) { + return op_name.consume_front(prefix); + })) { return errors::FailedPrecondition("op node '", op_name.str(), "' was not a TF op!"); } @@ -506,4 +520,9 @@ bool IsLegacyCallInstruction(mlir::Operation* inst) { inst->getName().getStringRef().compare("_tf.LegacyCall") == 0; } +Status AddTensorFlowOpPrefix(std::string prefix) { + GlobalOpPrefixes()->insert(prefix); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 32ed528bd0d..58fe39fa4e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -34,10 +34,17 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" +namespace mlir { +class ShapedType; +} // namespace mlir + namespace tensorflow { using stream_executor::port::StatusOr; +// Add custom op prefix for TensorFlow dialects. +Status AddTensorFlowOpPrefix(std::string); + // Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control // dialect back into a TensorFlow valid op name. 
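The export_utils change above replaces the hard-coded "tf."/"_tf."/"tf_executor." prefix checks with a registrable set, and the AddTensorFlowOpPrefix declared here is how other exporters opt in (the TFJS translator later in this patch registers "tfjs."). A standalone sketch of the same check, using std::string in place of llvm::StringRef::consume_front:

```cpp
#include <algorithm>
#include <iostream>
#include <set>
#include <string>

// Registrable prefix set, mirroring GlobalOpPrefixes() above.
std::set<std::string>& OpPrefixes() {
  static std::set<std::string> prefixes = {"tf.", "_tf.", "tf_executor."};
  return prefixes;
}

// Strips the first registered prefix that matches; returns false if none do
// (the real code then reports "was not a TF op!").
bool StripOpPrefix(std::string* op_name) {
  const auto& prefixes = OpPrefixes();
  return std::any_of(prefixes.begin(), prefixes.end(),
                     [&](const std::string& prefix) {
                       if (op_name->rfind(prefix, 0) != 0) return false;
                       op_name->erase(0, prefix.size());
                       return true;
                     });
}

int main() {
  OpPrefixes().insert("tfjs.");  // what AddTensorFlowOpPrefix("tfjs.") enables
  std::string name = "tfjs.Prelu";
  std::cout << StripOpPrefix(&name) << " " << name << "\n";  // prints: 1 Prelu
  return 0;
}
```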
StatusOr GetTensorFlowOpName(llvm::StringRef); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 6cf2781e48d..282b7ad3139 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -26,9 +26,9 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -39,6 +39,12 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { + +const char* const kTPUReplicatedHost = "TPU_REPLICATED_HOST"; +const char* const kNumCoresPerReplicaAttr = "num_cores_per_replica"; +const char* const kTopologyAttr = "topology"; +const char* const kDeviceAssignmentAttr = "device_assignment"; + // Device coordinates are defined as (x, y, z, core), thus resulting in a rank 4 // topology. constexpr int kTPUTopologyRank = 4; @@ -46,8 +52,8 @@ constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; +constexpr char kBadIntArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not an int"; using Device = DeviceNameUtils::ParsedName; using Devices = llvm::ArrayRef; @@ -164,12 +170,19 @@ std::string GetTPUCompilationDevice(Device system_device) { return DeviceNameUtils::ParsedNameToString(system_device); } +// Finds the host CPU device for a given TPU device. +std::string GetCPUHostDeviceForTPUDevice(Device tpu_device) { + tpu_device.type = DEVICE_CPU; + tpu_device.id = 0; + return DeviceNameUtils::ParsedNameToString(tpu_device); +} + // Determines execution devices when topology and device assignment are not // defined. This is a special case where a single core computation is replicated // to every core in the mesh. TPU devices are simply added to // `execution_devices` of one replica. `num_replicas` must be 1 or the total // number of TPU devices available, and `num_cores_per_replica` must be 1. 
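The full-mesh path documented above assigns replica i the TPU at task i / num_tpus_per_task, device i % num_tpus_per_task, and this change additionally records that task's CPU:0 as the associated host device for outside compilation. A standalone sketch of the pairing with hand-formatted device strings (the real code builds names through DeviceNameUtils::ParsedNameToString):

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  const int num_tasks = 2;
  const int num_tpus_per_task = 4;
  const int num_replicas = num_tasks * num_tpus_per_task;  // one core per replica

  // Each replica gets one TPU device plus the CPU:0 host of the same task.
  std::vector<std::pair<std::string, std::string>> devices_and_hosts;
  devices_and_hosts.reserve(num_replicas);
  for (int i = 0; i < num_replicas; ++i) {
    const int task = i / num_tpus_per_task;
    const int device = i % num_tpus_per_task;
    const std::string prefix =
        "/job:worker/replica:0/task:" + std::to_string(task);
    devices_and_hosts.emplace_back(
        prefix + "/device:TPU:" + std::to_string(device),
        prefix + "/device:CPU:0");
  }

  for (const auto& dh : devices_and_hosts)
    std::cout << dh.first << " -> " << dh.second << "\n";
  return 0;
}
```

With 2 tasks of 4 TPUs each, this reproduces the device/host pairs asserted in the ValidFullMeshDeviceAssignment test later in this patch.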
-StatusOr GetFullMeshTPUExecutionDeviceAssignment( +StatusOr GetFullMeshTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices) { const int num_tasks = tpu_devices.size(); @@ -185,17 +198,18 @@ StatusOr GetFullMeshTPUExecutionDeviceAssignment( "'num_cores_per_replica' must be equal to 1, got ", num_cores_per_replica); - ExecutionDevices execution_devices; - execution_devices.reserve(num_replicas); + TPUDevicesAndHosts devices_and_hosts; + devices_and_hosts.reserve(num_replicas); for (int i = 0; i < num_replicas; ++i) { const int task = i / num_tpus_per_task; const int device = i % num_tpus_per_task; - execution_devices.push_back( - {tensorflow::DeviceNameUtils::ParsedNameToString( - tpu_devices[task][device])}); + const auto& tpu_device = tpu_devices[task][device]; + devices_and_hosts.push_back({TPUDeviceAndHost( + /*device=*/tensorflow::DeviceNameUtils::ParsedNameToString(tpu_device), + /*host=*/GetCPUHostDeviceForTPUDevice(tpu_device))}); } - return execution_devices; + return devices_and_hosts; } // Helper struct for keeping track of task and device for an associated TPU @@ -326,7 +340,7 @@ StatusOr> ParseTopologyAttr( // - number of device coordinates (in tuple 3) match number 'num_replicas' * // 'num_cores_per_replica' // - a TPU device associated with each device coordinate -StatusOr> +StatusOr> GetGeneralTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices, @@ -361,9 +375,9 @@ GetGeneralTPUExecutionDeviceAssignment( std::vector used_device_ids( location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), false); - ExecutionDevices execution_devices( - num_replicas, - llvm::SmallVector(num_cores_per_replica, "")); + TPUDevicesAndHosts devices_and_hosts( + num_replicas, llvm::SmallVector( + num_cores_per_replica, TPUDeviceAndHost())); xla::DeviceAssignment device_assignment(num_replicas, num_cores_per_replica); int pos = 0; for (int replica = 0; replica < num_replicas; ++replica) { @@ -393,20 +407,43 @@ GetGeneralTPUExecutionDeviceAssignment( used_device_ids[device_id] = true; device_assignment(replica, logical_core) = device_id; - execution_devices[replica][logical_core] = - DeviceNameUtils::ParsedNameToString(tpu_devices[task][device]); + auto& device_and_host = devices_and_hosts[replica][logical_core]; + const auto& tpu_device = tpu_devices[task][device]; + device_and_host.device = DeviceNameUtils::ParsedNameToString(tpu_device); + device_and_host.host = GetCPUHostDeviceForTPUDevice(tpu_device); } } xla::DeviceAssignmentProto device_assignment_proto; TF_RETURN_IF_ERROR(device_assignment.Serialize(&device_assignment_proto)); - return std::pair( - std::move(execution_devices), std::move(device_assignment_proto)); + return std::pair( + std::move(devices_and_hosts), std::move(device_assignment_proto)); } } // anonymous namespace +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr) { + llvm::SmallVector device_coordinates; + device_coordinates.reserve(device_assignment_attr.size()); + + for (auto device_coordinate_and_idx : + llvm::enumerate(device_assignment_attr)) { + auto device_coordinate = + device_coordinate_and_idx.value().dyn_cast(); + if (!device_coordinate) + return errors::InvalidArgument( + llvm::formatv(kBadIntArrayElementMsg, kDeviceAssignmentAttr, + device_coordinate_and_idx.index()) + .str()); + + device_coordinates.push_back(device_coordinate.getInt()); + } + + return device_coordinates; +} + StatusOr GetTPUCompilationAndExecutionDevices( 
Devices devices, int num_replicas, int num_cores_per_replica, llvm::StringRef topology_attr, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index dd296a13f4b..6bb541ab683 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" @@ -30,32 +31,52 @@ limitations under the License. namespace tensorflow { using stream_executor::port::StatusOr; -// TPU devices to be used for execution (e.g. devices for TPUExecute ops). They -// are ordered by `num_replicas` followed by `num_cores_per_replica`. -using ExecutionDevices = - llvm::SmallVector, 8>; +extern const char* const kTPUReplicatedHost; +extern const char* const kNumCoresPerReplicaAttr; +extern const char* const kTopologyAttr; +extern const char* const kDeviceAssignmentAttr; -// TPU compilation device, execution devices, and optionally execution device -// IDs. Execution device IDs are populated if `topology` and `device_assignment` -// are provided. +// A TPU device for execution alongside its associated host CPU device. +struct TPUDeviceAndHost { + TPUDeviceAndHost() {} + TPUDeviceAndHost(llvm::StringRef device, llvm::StringRef host) + : device(device), host(host) {} + + std::string device; + std::string host; +}; + +// TPU devices to be used for execution (e.g. devices for TPUExecute ops) and +// their associated host CPU devices (for outside compilation). They are ordered +// by `num_replicas` followed by `num_cores_per_replica`. +using TPUDevicesAndHosts = + llvm::SmallVector, 8>; + +// TPU compilation device, execution and associated host devices, and optionally +// execution device IDs. Execution device IDs are populated if `topology` and +// `device_assignment` are provided. struct TPUDeviceAssignment { TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices) + TPUDevicesAndHosts&& tpu_devices) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)) {} + tpu_devices(std::move(tpu_devices)) {} TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices, + TPUDevicesAndHosts&& tpu_devices, xla::DeviceAssignmentProto&& xla_device_assignment) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)), + tpu_devices(std::move(tpu_devices)), xla_device_assignment(std::move(xla_device_assignment)) {} std::string compilation_device; - ExecutionDevices execution_devices; + TPUDevicesAndHosts tpu_devices; llvm::Optional xla_device_assignment; }; +// Extracts device coordinates from a device assignment attribute on an op. +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr); + // Finds the TPU compilation device and execution devices from `devices` for a // TPU computation subgraph. 
Compilation device is determined from looking up // all TPU_SYSTEM:0 devices and choosing the CPU device associated to the first diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 87319f2adeb..a70e93a0195 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -323,30 +325,46 @@ TEST(TPURewriteDeviceUtilTest, ValidFullMeshDeviceAssignment) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 8); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 1); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 8); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 1); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[4][0], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[4][0].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[5][0], + EXPECT_EQ(tpu_devices[4][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[5][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[6][0], + EXPECT_EQ(tpu_devices[5][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[6][0].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[7][0], + EXPECT_EQ(tpu_devices[6][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[7][0].device, "/job:worker/replica:0/task:1/device:TPU:3"); + EXPECT_EQ(tpu_devices[7][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); EXPECT_FALSE(tpu_device_assignment.xla_device_assignment.hasValue()); } @@ -410,30 +428,46 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = 
status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 4); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 2); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 4); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 2); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:3"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[2][1], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][1], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][1].device, "/job:worker/replica:0/task:1/device:TPU:1"); + EXPECT_EQ(tpu_devices[3][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -511,23 +545,35 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 2); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 3); + auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 2); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 3); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[0][2], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][2].device, "/job:worker/replica:0/task:2/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][2].host, + 
"/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:2/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][2], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][2].device, "/job:worker/replica:0/task:0/device:TPU:1"); + EXPECT_EQ(tpu_devices[1][2].host, + "/job:worker/replica:0/task:0/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -552,5 +598,29 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } +TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getI64ArrayAttr({1, 2, 3}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(status_or_device_coodinates.ok()); + auto device_coordinates = status_or_device_coodinates.ConsumeValueOrDie(); + EXPECT_EQ(device_coordinates[0], 1); + EXPECT_EQ(device_coordinates[1], 2); + EXPECT_EQ(device_coordinates[2], 3); +} + +TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getF32ArrayAttr({1.0, 2.0, 3.0}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(!status_or_device_coodinates.ok()); + EXPECT_EQ(status_or_device_coodinates.status().error_message(), + "bad 'device_assignment' attribute at index 0, not an int"); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 62b862f5e21..2e1528e0d60 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -104,26 +104,24 @@ int main(int argc, char** argv) { return 1; } + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_vector); + if (import_saved_model_object_graph) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); - std::vector exported_names = - absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); mlir::MLIRContext context; auto module = tensorflow::SavedModelObjectGraphToMlirImport( - input_filename, tags, absl::Span(exported_names), - &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); } else if (import_saved_model_signature_defs) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); mlir::MLIRContext context; auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 9b731d2c912..ac629ac4573 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ 
-1,4 +1,5 @@ load("//third_party/mlir:tblgen.bzl", "gentbl") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( default_visibility = ["//visibility:public"], @@ -39,7 +40,7 @@ gentbl( "ir/tfjs_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -131,10 +132,106 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", - "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", ], ) + +cc_library( + name = "json_translate_lib", + srcs = [ + "translate/json_translate.cc", + ], + hdrs = [ + "translate/json_translate.h", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_to_tfjs_json", + srcs = ["translate/tf_to_tfjs_json.cc"], + hdrs = [ + "translate/tf_to_tfjs_json.h", + ], + deps = [ + ":json_translate_lib", + ":tfjs_optimize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +tf_cc_binary( + name = "json_translate", + deps = [ + ":json_translate_lib", + "@llvm-project//mlir:MlirTranslateMain", + ], +) + +filegroup( + name = "tf_tfjs_translate_main", + srcs = [ + "translate/tf_tfjs_translate.cc", + ], +) + +tf_cc_binary( + name = "tf_tfjs_translate", + srcs = [":tf_tfjs_translate_main"], + deps = [ + ":json_translate_lib", + ":tensorflow_js_passes", + ":tf_to_tfjs_json", + ":tfjs_optimize", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + 
"@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 318895de79c..9c98c9b0e19 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -26,8 +26,9 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project + namespace mlir { namespace tfjs { diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 172347bc0f5..134aa010d8c 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -23,7 +23,7 @@ limitations under the License. #define TFJS_DIALECT include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // TensorFlow.js dialect definitions diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD similarity index 65% rename from tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD rename to tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD index 4faa8d2efe8..5c8d37da2f0 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD @@ -1,11 +1,15 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") -package(licenses = ["notice"]) +licenses(["notice"]) glob_lit_tests( - data = [":test_utilities"], + data = [ + ":test_utilities", + ], driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], + test_file_exts = [ + "pbtxt", + ], ) # Bundle together all of the test utilities that are used by tests. 
@@ -13,7 +17,7 @@ filegroup( name = "test_utilities", testonly = True, data = [ - "//tensorflow/compiler/mlir:tf-opt", + "//tensorflow/compiler/mlir/tfjs:tf_tfjs_translate", "@llvm-project//llvm:FileCheck", ], ) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt new file mode 100644 index 00000000000..f6a324fdc13 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt @@ -0,0 +1,78 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Mul -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "Add" + op: "Add" + input: "input0" + input: "input1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "input1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "Mul" + op: "Mul" + input: "Add" + input: "Add" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +versions { + producer: 27 +} + +# CHECK: "name": "input0" +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "input1", +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Add" +# CHECK-NEXT: "op": "AddV2" +# CHECK-NEXT: "input": +# CHECK-NEXT: "input0" +# CHECK-NEXT: "input1" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul1" +# CHECK-NEXT: "op": "Mul" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Add" +# CHECK-NEXT: "Add" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul" +# CHECK-NEXT: "op": "_Retval" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Mul1" +# CHECK: "type": "DT_INT32" +# CHECK: "library" +# CHECK: "versions" +# CHECK: "producer": 27 + diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt new file mode 100644 index 00000000000..810db71f5e0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt @@ -0,0 +1,175 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0 -tf-input-data-types=DT_FLOAT -tf-input-shapes=10 -tf-output-arrays=Add -tf-custom-opdefs="name: 'Prelu' input_arg: { name: 'x' type: DT_FLOAT } input_arg: { name: 'alpha' type: DT_FLOAT } output_arg: { name: 'c' type: DT_FLOAT }" -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } + experimental_debug_info { + } +} +node { + name: "alpha" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5 + } + } + } + experimental_debug_info { + } +} +node { + name: "Relu" + op: "Relu" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Neg" + op: "Neg" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Relu1" + op: "Relu" + input: "Neg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Mul" + op: "Mul" + input: "alpha" + input: "Relu1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + 
experimental_debug_info { + } +} +node { + name: "Add" + op: "Add" + input: "Relu" + input: "Mul" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 344 +} + +# CHECK: "node": +# CHECK: "name": "input0", +# CHECK-NEXT: "op": "Placeholder", +# CHECK-NEXT: "attr": +# CHECK: "type": "DT_FLOAT" +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul", +# CHECK-NEXT: "op": "Const", +# CHECK-NEXT: "attr": +# CHECK: "value": +# CHECK: "tensor": +# CHECK: "dtype": "DT_FLOAT", +# CHECK: "tensorShape": {}, +# CHECK: "floatVal": +# CHECK: -0.5 +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul1", +# CHECK-NEXT: "op": "Prelu", +# CHECK-NEXT: "input": +# CHECK: "input0", +# CHECK: "Add.Relu.Neg.Relu1.Mul" +# CHECK: "attr": +# CHECK: "_output_shapes": +# CHECK: "list": +# CHECK: "shape": +# CHECK: "dim": +# CHECK: "size": "10" +# CHECK: "experimentalDebugInfo": {} +# CHECK: "name": "Add", +# CHECK-NEXT: "op": "_Retval", +# CHECK-NEXT: "input": +# CHECK: "Add.Relu.Neg.Relu1.Mul1" +# CHECK: "attr": +# CHECK: "T": +# CHECK: "type": "DT_FLOAT" +# CHECK: "library": {}, +# CHECK: "versions": +# CHECK: "producer": 344 + diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc index 631bb1ae2af..a445937570e 100644 --- a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" @@ -47,6 +46,11 @@ void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm) { // Canonicalize, CSE etc. pm->addNestedPass(mlir::createCanonicalizerPass()); pm->addNestedPass(mlir::createCSEPass()); + + // raise to executor dialect in order to use GraphDef converter + pm->addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm->addNestedPass(mlir::CreateBreakUpIslandsPass()); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc new file mode 100644 index 00000000000..7f4b8ffae09 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +using mlir::ModuleOp; +using mlir::TranslateFromMLIRRegistration; +using std::string; +using tensorflow::Status; +using xla::StatusOr; + +// Translates the given MLIR module in the TFJS dialect to TFJS JSON +// format. Returns false on success. +// +bool tfjs::MlirToJSONTranslateFunction(ModuleOp module, + std::string* serialized_json) { + string json_output; + // Allow TF to treat TFJS ops as TF ops. + if (!tensorflow::AddTensorFlowOpPrefix("tfjs.").ok()) { + LOG(ERROR) << "Failed to add tfjs op prefix."; + return false; + } + tensorflow::GraphExportConfig confs; + confs.export_shapes = true; + confs.export_library = true; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + absl::flat_hash_set control_ret_nodes; + auto graph = absl::make_unique(flib_def); + auto status = tensorflow::ConvertMlirToGraph(module, confs, &graph, &flib_def, + &control_ret_nodes); + if (!status.ok()) { + LOG(ERROR) << "Graph export failed: " << status; + return false; + } + auto graphdef = absl::make_unique(); + graph->ToGraphDef(graphdef.get()); + + // Replace the _Arg nodes of the main function with Placeholder op. 
+ auto nodes = graphdef->mutable_node(); + for (const auto& node : llvm::enumerate(*nodes)) { + if (node.value().op() == "_Arg") { + nodes->Mutable(node.index())->set_op("Placeholder"); + } + } + + tensorflow::protobuf::util::JsonPrintOptions json_options; + json_options.add_whitespace = true; + auto jsonStatus = tensorflow::protobuf::util::MessageToJsonString( + *graphdef, &json_output, json_options); + if (!jsonStatus.ok()) { + LOG(ERROR) << "Proto2Json failed: " << status; + return false; + } + *serialized_json = std::move(json_output); + return true; +} + +static mlir::LogicalResult MlirToJSONFileTranslateFunction( + ModuleOp module, llvm::raw_ostream& output) { + std::string serialized_json; + if (!tfjs::MlirToJSONTranslateFunction(module, &serialized_json)) + return mlir::failure(); + + output << serialized_json; + return mlir::success(); +} + +static TranslateFromMLIRRegistration MLIRToJSONFileTranslate( + "mlir-to-tfjs-json", MlirToJSONFileTranslateFunction); diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.h b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h new file mode 100644 index 00000000000..0a931f770ad --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ + +#include + +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace tfjs { + +// Translates the given MLIR `module` into a JSON string. Returns true if +// translation fails, otherwise returns false. +bool MlirToJSONTranslateFunction(mlir::ModuleOp module, + std::string* serialized_json); +} // namespace tfjs + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc new file mode 100644 index 00000000000..e735a3c7b8c --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc @@ -0,0 +1,173 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/strings/str_split.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h" +#include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using llvm::cl::opt; +using mlir::MLIRContext; +using stream_executor::port::StatusOr; + +// NOLINTNEXTLINE +opt input_file_name(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", + llvm::cl::desc("Import a saved model to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", + llvm::cl::desc("Import a saved model V1 to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MetaGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt saved_model_exported_names( + "tf-savedmodel-exported-names", + llvm::cl::desc("Names to export from SavedModel, separated by ','. Empty " + "(the default) means export all."), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt output_file_name("o", llvm::cl::desc(""), + llvm::cl::value_desc("filename"), + llvm::cl::init("-")); +// NOLINTNEXTLINE +opt input_mlir( + "input-mlir", + llvm::cl::desc("Take input TensorFlow model in textual MLIR instead of " + "GraphDef format"), + llvm::cl::init(false), llvm::cl::Hidden); +// NOLINTNEXTLINE +opt output_mlir( + "output-mlir", + llvm::cl::desc("Output MLIR rather than JSON for the generated TFJS model"), + llvm::cl::init(false)); + +// The following approach allows injecting opdefs in addition +// to those that are already part of the global TF registry to be linked in +// prior to importing the graph. The primary goal is for support of custom ops. +// This is not intended to be a general solution for custom ops for the future +// but mainly for supporting older models like mobilenet_ssd. More appropriate +// mechanisms, such as op hints or using functions to represent composable ops +// like https://github.com/tensorflow/community/pull/113 should be encouraged +// going forward. +// NOLINTNEXTLINE +llvm::cl::list custom_opdefs( + "tf-custom-opdefs", llvm::cl::desc("List of custom opdefs when importing " + "graphdef")); + +// Debugging flag to print function mapping in the JSON. 
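The flag set defined above matches the lit RUN lines in the e2e tests earlier in this change; a typical invocation (file names illustrative) is `tf_tfjs_translate add.pbtxt -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Mul -o add.json`, with `-tf-custom-opdefs` added when the GraphDef uses ops outside the global registry (as the Prelu test does) and `--output-mlir` to inspect the intermediate TFJS dialect instead of the JSON output.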
+// NOLINTNEXTLINE +static opt print_function_result_mapping( + "print-function-result-mapping", + llvm::cl::desc( + "Print the mapping of function result to json output buffer"), + llvm::cl::init(false)); + +enum TranslationStatus { kTrSuccess, kTrFailure }; + +static int PrintFunctionResultMapping(const std::string& result) { + std::cout << result << std::endl; + return kTrSuccess; +} + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + llvm::cl::ParseCommandLineOptions(argc, argv, + "TF GraphDef to TFJS JSON converter\n"); + + MLIRContext context; + llvm::SourceMgr source_mgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); + + StatusOr module; + + if (import_saved_model_object_graph || import_saved_model_signature_defs) { + if (input_mlir) + module = tensorflow::errors::InvalidArgument( + "Importing saved model should not have input_mlir set"); + module = tensorflow::ImportSavedModel( + import_saved_model_object_graph, import_saved_model_signature_defs, + custom_opdefs, input_file_name, saved_model_tags, + saved_model_exported_names, &context); + } else { + module = tensorflow::LoadFromGraphdefOrMlirSource( + input_file_name, input_mlir, custom_opdefs, debug_info_file, + input_arrays, input_dtypes, input_shapes, output_arrays, + /*prune_unused_nodes=*/true, &source_mgr, &context); + } + + // If errors occur, the library call in the above already logged the error + // message. So we can just return here. + if (!module.ok()) return kTrFailure; + + mlir::PassManager pm(&context); + + tensorflow::AddTFToTFJSConversionPasses(&pm); + + std::string result; + auto status = tensorflow::ConvertTFOpsToTfjsJSON(module.ValueOrDie().get(), + output_mlir, &result, &pm); + if (!status.ok()) return kTrFailure; + + std::string error_msg; + auto output = mlir::openOutputFile(output_file_name, &error_msg); + if (output == nullptr) { + llvm::errs() << error_msg << '\n'; + return kTrFailure; + } + output->os() << result; + output->keep(); + + // Print out debugging info related to function mapping. + if (print_function_result_mapping) return PrintFunctionResultMapping(result); + return kTrSuccess; +} diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc new file mode 100644 index 00000000000..7dc9ea049ba --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc @@ -0,0 +1,152 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::OwningModuleRef; +using stream_executor::port::StatusOr; + +namespace { +tensorflow::Status RegisterCustomOps( + const std::vector& extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Register extra opdefs. + tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} +} // namespace + +StatusOr LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, MLIRContext* context) { + // Set up the input file. 
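RegisterCustomOps above parses each textual OpDef supplied on the command line and registers it with the global op registry so the importer can resolve ops that are not linked into the binary. A hedged usage fragment (not standalone; assumes the headers already included in this file and a caller returning Status), reusing the OpDef string the Prelu e2e test passes via -tf-custom-opdefs:

```cpp
// The same OpDef text the prelu.pbtxt test passes via -tf-custom-opdefs.
std::vector<std::string> extra_opdefs = {
    "name: 'Prelu' "
    "input_arg: { name: 'x' type: DT_FLOAT } "
    "input_arg: { name: 'alpha' type: DT_FLOAT } "
    "output_arg: { name: 'c' type: DT_FLOAT }"};
// After this, the GraphDef import in LoadFromGraphdefOrMlirSource can resolve
// 'Prelu' nodes even though the op is not built into TensorFlow.
TF_RETURN_IF_ERROR(RegisterCustomOps(extra_opdefs));
```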
+ std::string error_message; + auto file = mlir::openInputFile(input_filename, &error_message); + if (!file) { + llvm::errs() << error_message << "\n"; + return errors::InvalidArgument("fail to open input file"); + } + + if (input_mlir) { + source_mgr->AddNewSourceBuffer(std::move(file), llvm::SMLoc()); + return OwningModuleRef(mlir::parseSourceFile(*source_mgr, context)); + } + + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + + return tensorflow::GraphdefToMlirTranslateFunction( + file->getBuffer(), debug_info_file, input_arrays, input_dtypes, + input_shapes, output_arrays, /*control_output_arrays=*/"", + prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/true, context); +} + +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager) { + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), + /*propagate=*/true); + if (failed(pass_manager->run(module))) { + return statusHandler.ConsumeStatus(); + } + + if (export_to_mlir) { + llvm::raw_string_ostream os(*result); + module.print(os); + return Status::OK(); + } + + return tfjs::MlirToJSONTranslateFunction(module, result) + ? Status::OK() + : statusHandler.ConsumeStatus(); +} + +StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context) { + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_in_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_in_vector); + if (import_saved_model) { + auto module = tensorflow::SavedModelObjectGraphToMlirImport( + input_filename, tags, absl::Span(exported_names), context); + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else if (import_saved_model_v1) { + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, exported_names, context); + + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else { + return tensorflow::errors::InvalidArgument( + "Should be either saved model v1 or v2"); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h new file mode 100644 index 00000000000..d68f0e7d46e --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +// Load a TF model from a GraphDef definition or a TF control flow dialect MLIR +// source into a MLIR module. If `input_mlir` is true, load from a MLIR source +// file; otherwise, load from a GraphDef. +// Setting prune_unused_nodes to true, would prune unreachable nodes if +// output_arrays is specified. +stream_executor::port::StatusOr +LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + +// Load Saved model (either v1 or v2) into MLIR. +stream_executor::port::StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context); + +// Taking a MLIR module in TF executor dialect and a set of parameters, +// applies a set of passes to convert the module to TFJS dialect and +// serializes the result to JSON string. +// If `export_to_mlir` is true, the result is exported in MLIR text format, +// otherwise exported in JSON. +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD deleted file mode 100644 index 88e214f601b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ /dev/null @@ -1,156 +0,0 @@ -load("//third_party/mlir:tblgen.bzl", "gentbl") -load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") - -# TF to TFRT kernels conversion. 
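For context on the tf_to_tfjs_json API added above (LoadFromGraphdefOrMlirSource, ConvertTFOpsToTfjsJSON, ImportSavedModel), the sketch below shows one way the first two entry points could be wired together. It is illustrative only: the wrapper name, the std::string element type for extra_tf_opdefs, and the empty pass pipeline are assumptions, not part of this change, and the real translate driver is not shown in this hunk.

```cpp
// Minimal sketch of driving the new tf_to_tfjs_json entry points.
// Assumptions (not in this diff): std::string elements for extra_tf_opdefs,
// a caller-configured TF->TFJS pipeline, and the name TranslateGraphDefToTfjsJson.
#include <string>

#include "llvm/Support/SourceMgr.h"
#include "mlir/IR/MLIRContext.h"    // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h"

tensorflow::Status TranslateGraphDefToTfjsJson(const std::string& filename,
                                               std::string* json_output) {
  mlir::MLIRContext context;
  llvm::SourceMgr source_mgr;

  // Import a GraphDef (input_mlir = false) into a TF-dialect MLIR module.
  auto module_or = tensorflow::LoadFromGraphdefOrMlirSource(
      filename, /*input_mlir=*/false, /*extra_tf_opdefs=*/{},
      /*debug_info_file=*/"", /*input_arrays=*/"", /*input_dtypes=*/"",
      /*input_shapes=*/"", /*output_arrays=*/"", /*prune_unused_nodes=*/true,
      &source_mgr, &context);
  if (!module_or.ok()) return module_or.status();

  // The caller is expected to populate the pass manager with the TF->TFJS
  // legalization pipeline before converting; an empty pipeline only
  // round-trips the module.
  mlir::PassManager pm(&context);
  return tensorflow::ConvertTFOpsToTfjsJSON(module_or.ValueOrDie().get(),
                                            /*export_to_mlir=*/false,
                                            json_output, &pm);
}
```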
-package( - default_visibility = [":friends"], - licenses = ["notice"], # Apache 2.0 -) - -package_group( - name = "friends", - packages = [ - "//learning/brain/experimental/tfrt/...", - "//tensorflow/compiler/...", - "//tensorflow/core/runtime_fallback/...", - "//tensorflow/core/tfrt/experimental/saved_model/...", - "//third_party/tf_runtime_google/...", - ], -) - -cc_library( - name = "tf_legalize_to_tfrt", - srcs = [ - "tf_legalize_to_hex.cc", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "@com_google_absl//absl/memory", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - -filegroup( - name = "runtime_fallback_ops_td_files", - srcs = [ - "runtime_fallback/runtime_fallback_ops.td", - "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", - "@tf_runtime//:OpBaseTdFiles", - ], -) - -gentbl( - name = "runtime_fallback_ops_inc_gen", - tbl_outs = [ - ( - "-gen-op-decls", - "runtime_fallback_ops.h.inc", - ), - ( - "-gen-op-defs", - "runtime_fallback_ops.cc.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "runtime_fallback/runtime_fallback_ops.td", - td_includes = [ - "external/tf_runtime/include", - ], - td_srcs = [ - ":runtime_fallback_ops_td_files", - ], -) - -cc_library( - name = "runtime_fallback_opdefs_alwayslink", - srcs = [ - "runtime_fallback/dialect_static_registration.cc", - "runtime_fallback/runtime_fallback_combine.cc", - "runtime_fallback/runtime_fallback_ops.cc", - ], - hdrs = [ - "runtime_fallback/runtime_fallback_ops.h", - ], - deps = [ - ":runtime_fallback_ops_inc_gen", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:SideEffects", - "@llvm-project//mlir:Support", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:tensor_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "lower_tf_to_tfd_alwayslink", - srcs = ["runtime_fallback/lower_tf_to_tfd.cc"], - deps = [ - "runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "tf_to_corert", - srcs = [ - "transforms/optimize.cc", - "transforms/tf_to_corert.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/core:framework", - "//tensorflow/core/platform:tstring", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:core_runtime_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "compatibility_analysis", - srcs = [ - "analysis/compatibility_analysis.cc", - ], - hdrs = [ - "analysis/compatibility_analysis.h", - ], - deps = [ - ":analysis/analysis_proto_cc", - ":tf_to_corert", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/core:lib_proto_parsing", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Translation", - ], - alwayslink = 1, -) - -tf_proto_library_cc( - name = "analysis/analysis_proto", - srcs 
= ["analysis/analysis.proto"], - cc_api_version = 2, -) diff --git a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto b/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto deleted file mode 100644 index 0716a243bb3..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; - -package mlir.tfrt; - -message CompatibilityAnalysisReportProto { - bool unknown_dialect = 1; - bool ref_variable = 2; - bool incompatible_variable = 3; - bool incompatible_attribute = 4; - bool control_flow_v1 = 5; - - // TODO(chky): add more checks, eg. tensor datatypes. -} - -message CompatibilityAnalysisProto { - CompatibilityAnalysisReportProto summary = 1; - - message OpInfo { - int32 count = 1; - - CompatibilityAnalysisReportProto report = 2; - } - - map ops = 2; -} diff --git a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc deleted file mode 100644 index 7e9c5544c25..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace tensorflow { -namespace { - -class CompatibilityAnalysis { - public: - void AnalyzeOperation(mlir::Operation* op); - - const mlir::tfrt::CompatibilityAnalysisProto& GetResult() const { - return analysis_; - } - - private: - // Return true if some attributes in the op are not supported. - bool AnalyzeOpAttributes(mlir::Operation* op); - // Return true if this op has unsupported operation (eg. mutate) on resource - // variables. - bool AnalyzeVariable(mlir::Operation* op); - - void UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report); - - mlir::tfrt::CompatibilityAnalysisProto analysis_; -}; - -void CompatibilityAnalysis::AnalyzeOperation(mlir::Operation* op) { - // Skip the standard ops that are allowed in tf dialect. 
- if (llvm::isa(op) || llvm::isa(op) || - llvm::isa(op) || llvm::isa(op)) - return; - - auto op_name = op->getName(); - - std::string name = op_name.getStringRef().str(); - - mlir::tfrt::CompatibilityAnalysisReportProto op_report; - - if (op_name.getDialect() == - mlir::TF::TensorFlowDialect::getDialectNamespace()) { - // Analyze op attributes. - if (AnalyzeOpAttributes(op)) op_report.set_incompatible_attribute(true); - - // Analyze variable operations. - if (AnalyzeVariable(op)) op_report.set_incompatible_variable(true); - - // Reference variable is not supported. - if (op_name.getStringRef() == "tf.VariableV2") - op_report.set_ref_variable(true); - } else if (op_name.getDialect() == "tf_executor") { - if (llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op)) { - op_report.set_control_flow_v1(true); - } else { - // Skip the rest of the tf_executor ops as they can be handled. - // - // TODO(chky): consider adding whitelist here. - return; - } - } else { - // Mark unknown dialect in the report. - op_report.set_unknown_dialect(true); - } - - auto& op_info = (*analysis_.mutable_ops())[name]; - op_info.set_count(op_info.count() + 1); - - UpdateReport(op_report, op_info.mutable_report()); - UpdateReport(op_report, analysis_.mutable_summary()); -} - -bool CompatibilityAnalysis::AnalyzeOpAttributes(mlir::Operation* op) { - // tf.Const gets special handling so it is always compatible. - if (llvm::isa(op)) return false; - - // TODO(chky): Derived attributes should be also analyzed here. - for (auto attr : op->getAttrs()) { - if (attr.first.strref() == "_output_shapes") continue; - if (attr.first.strref() == "_class") continue; - - // Symbol attributes (eg. function names) is currently not supported. - // - // TODO(chky): CoreRT should ideally support function call operatoins. - // Remove this condition once that is implemented. - if (attr.second.isa()) return true; - - // Currently only tensors of simple dtypes (i1, i32, i64, f32, f64) are - // supported. - if (auto elements_attr = attr.second.dyn_cast()) { - if (!elements_attr.isa()) return true; - auto element_type = elements_attr.getType().getElementType(); - if (element_type.isa()) return true; - } - - // Currently only arrays of simple element types (i1, i32, i64, f32, f64) - // are supported. - if (auto array_attr = attr.second.dyn_cast()) { - if (array_attr.size() > 0) { - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - } - } - } - return false; -} - -bool CompatibilityAnalysis::AnalyzeVariable(mlir::Operation* op) { - // Currently only supported variable op is ReadVariableOp. 
- if (llvm::isa(op)) return false; - - for (auto value : op->getOperands()) { - auto type = value.getType(); - if (auto tensor_type = type.dyn_cast()) { - auto element_type = tensor_type.getElementType(); - if (element_type.isa()) return true; - } - } - - return false; -} - -void CompatibilityAnalysis::UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report) { - if (new_report.unknown_dialect()) old_report->set_unknown_dialect(true); - - if (new_report.ref_variable()) old_report->set_ref_variable(true); - - if (new_report.incompatible_variable()) - old_report->set_incompatible_variable(true); - - if (new_report.incompatible_attribute()) - old_report->set_incompatible_attribute(true); - - if (new_report.control_flow_v1()) old_report->set_control_flow_v1(true); -} - -} // namespace - -mlir::tfrt::CompatibilityAnalysisProto AnalyzeTFCompatibility( - mlir::ModuleOp op) { - CompatibilityAnalysis analysis; - op.walk([&analysis](mlir::Operation* op) { analysis.AnalyzeOperation(op); }); - return analysis.GetResult(); -} - -static mlir::TranslateFromMLIRRegistration registration( - "analyze-tf-for-tfrt", [](mlir::ModuleOp op, llvm::raw_ostream& output) { - auto analysis_proto = AnalyzeTFCompatibility(op); - std::string text_proto; - if (tensorflow::protobuf::TextFormat::PrintToString(analysis_proto, - &text_proto)) { - output << text_proto; - return mlir::success(); - } - - return mlir::failure(); - }); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc deleted file mode 100644 index 5f831c9ef6a..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc +++ /dev/null @@ -1,390 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
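The compatibility analysis removed above is normally reached through the `analyze-tf-for-tfrt` translation registration, but it can also be queried directly via AnalyzeTFCompatibility. Below is a hedged sketch of a C++ consumer; the helper name, the logging choices, and the use of the generated analysis.proto accessors are illustrative assumptions rather than code from this diff.

```cpp
// Sketch: walk the proto produced by AnalyzeTFCompatibility (declared in the
// removed compatibility_analysis.h) and log ops flagged as incompatible.
#include "mlir/IR/Module.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h"
#include "tensorflow/core/platform/logging.h"

void PrintIncompatibleOps(mlir::ModuleOp module) {
  mlir::tfrt::CompatibilityAnalysisProto proto =
      tensorflow::AnalyzeTFCompatibility(module);

  // The summary is the OR of every per-op report (see UpdateReport above).
  if (proto.summary().control_flow_v1()) {
    LOG(WARNING) << "Module still uses TF control flow v1.";
  }
  for (const auto& entry : proto.ops()) {
    const auto& report = entry.second.report();
    if (report.incompatible_attribute() || report.incompatible_variable() ||
        report.ref_variable() || report.unknown_dialect()) {
      LOG(INFO) << entry.first << " (count=" << entry.second.count()
                << ") is not yet supported on TFRT.";
    }
  }
}
```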
-==============================================================================*/ - -#include -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/DialectConversion.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" - -namespace mlir { -namespace { - -constexpr const char kTmpLoweringCastOpName[] = "tmp_lowering_cast_op"; - -static Type GetChainType(MLIRContext* context) { - auto hexDialect = Identifier::get("hex", context); - return OpaqueType::get(hexDialect, "chain", context); -} - -static Type GetTfdTensorType(MLIRContext* context) { - auto tfdDialect = Identifier::get("tfd", context); - return OpaqueType::get(tfdDialect, "tf_tensor", context); -} - -struct TfToTfdLoweringPass - : public PassWrapper> { - void runOnOperation() final; -}; - -class FuncOpSignatureConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = funcOp.getContext(); - auto chain_type = GetChainType(ctx); - auto tfd_tensor_type = GetTfdTensorType(ctx); - FunctionType type = funcOp.getType(); - - // Convert function return results. The lowered function is expected to - // return a chain as the first return result. For each original TF tensor, - // the lowered function returns a TFD tensor instead. - llvm::SmallVector converted_results; - if (type.getNumResults() > 0) { - // Add a chain as the first return result. - converted_results.push_back(chain_type); - - // Convert the original TF tensor return results. - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (auto tensor_type = type.getResult(i).dyn_cast()) { - // Each TF tensor is converted to a TFD tensor. - converted_results.push_back(tfd_tensor_type); - } else { - // Only handle TF tensor conversion for now. - return failure(); - } - } - } - - // Create the new function signature. The lowered function is expected to - // take a Chain as the first argument. Then for each TF tensor argument, - // expect a TFD tensor argument instead. - TypeConverter::SignatureConversion new_func_sig(type.getNumInputs() + 1); - if (type.getNumInputs() > 0) { - // Add the first chain argument. - new_func_sig.addInputs(chain_type); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original TF tensor type, convert it to one TFD tensor type. - if (auto tensor_type = type.getInput(i).dyn_cast()) { - new_func_sig.addInputs(i, {tfd_tensor_type}); - } else { - // Only handle TF tensor argument for now. - return failure(); - } - } - } - // Each function has a single region. In general, each region can have - // multiple blocks. Assume that all TF-dialect functions only have a - // single entry block. - Block* entry = &funcOp.front(); - - // Tell the rewriter to convert the region signature. After this, the - // function region takes the new function signature, which means index - // shifts by one. - Block* convertedEntry = - rewriter.applySignatureConversion(&funcOp.getBody(), new_func_sig); - - { - // Generate the "fake" mapping ops. 
The insertion guard restores rewriter - // insertion pointer when it gets out of scope. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(convertedEntry); - // Replace block arguments. For example, - // func @example(i64, i1) -> i64 { - // ^bb0(%a: i64, %cond: i1): // replacing this. - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original block argument, create a fake op that takes the - // input the input chain argument to the function, and the tfd tensor - // argument, and returns the original TF tensor input. Note that the - // function signature has been replaced, so entry->getArgument(0) is the - // input chain. And we need to add 1 to index to get the original - // argument. - Type orig_input = type.getInput(i); - OperationState tmp_lowering_cast_op( - funcOp.getLoc(), kTmpLoweringCastOpName, - {convertedEntry->getArgument(0), - convertedEntry->getArgument(i + 1)}, - orig_input, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - // Replace original uses of TF tensor block argument with the result of - // the fake op. This sets up the lowering passes for individual ops - // which at this point still expect TF tensors rather than TFD tensor - // inputs. - rewriter.replaceUsesOfBlockArgument(entry->getArgument(i), repl_value); - } - } - - // Create a new function op with an updated signature. - auto new_func_op = rewriter.cloneWithoutRegions(funcOp); - rewriter.inlineRegionBefore(funcOp.getBody(), new_func_op.getBody(), - new_func_op.end()); - new_func_op.setType(FunctionType::get(new_func_sig.getConvertedTypes(), - converted_results, ctx)); - // Remove the old function op. - rewriter.eraseOp(funcOp); - return success(); - } -}; - -// Lower each TF op to a tfd.delegate_kernel op. For example, -// -// %1 = "tf.ReadVariableOp"(%arg) { -// dtype = "tfdtype$DT_FLOAT" -// } : (tensor<*x!tf.resource>) -> tensor<10xf32> -// -// would be lowered to -// -// %1:2 = "tfd.delegate_kernel"(%chain_in, %arg) { -// _name = "tf.ReadVariableOp", -// attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" -// } : (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) -// -// Each tfd.delegate_kernel op expects a chain as the first input. This chain -// may come from the first function argument or the previous converted op -// output. The rest of inputs would be converted to a tfd tensor input. -// Each tfd.delegate_kernel op returns a chain as the first output. Each -// original output TensorType is converted a tfd tensor type. -// The TF op name becomes an _name attribute. Each TF attribute is lowered to -// two TFD attributes, one for the name, one for the type and value. -// -// Because delegate_kernel ops are threaded through chains, we lowered to a -// serial execution plan. -// TODO(zhangqiaorjc): Do analysis to allow concurrent execution. -template -class TFOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - TF_OP op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) // NOLINT(google-runtime-references - const override { - auto ctx = op.getContext(); - // Handle new op operands. - // Delegate kernel expects the first argument to be a chain, followed by - // original arguments to the target TF op converted to TFD tensors. 
- llvm::SmallVector delegate_kernel_op_operands; - int num_new_operands = op.getOperation()->getNumOperands() + 1; - delegate_kernel_op_operands.reserve(num_new_operands); - - // Get the input chain from the previous delegate_kernel op or first block - // argument. - Value chain_input = nullptr; - auto* block = op.getOperation()->getBlock(); - assert(block->isEntryBlock() && "only supports a single block"); - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - assert(prev_op_result_0.getType() == GetChainType(ctx)); - chain_input = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - assert(arg_0.getType() == GetChainType(ctx)); - chain_input = arg_0; - } - delegate_kernel_op_operands.push_back(chain_input); - - // Convert each TensorType operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - assert(tmp_lowering_cast_op->getName().getStringRef() == - kTmpLoweringCastOpName); - delegate_kernel_op_operands.push_back( - tmp_lowering_cast_op->getOperand(1)); - } - - // Handle new op results. - llvm::SmallVector delegate_kernel_op_results; - // The first output is a chain. - delegate_kernel_op_results.push_back(GetChainType(ctx)); - // For each original output, there is a corresponding TFD tensor output. - for (int i = 0, e = op.getOperation()->getNumResults(); i != e; ++i) { - delegate_kernel_op_results.push_back(GetTfdTensorType(ctx)); - } - - // Convert TF attribute to TFD attribute. - llvm::SmallVector delegate_kernel_op_attributes; - NamedAttribute op_name_attr(Identifier::get("_name", ctx), - StringAttr::get(op.getOperationName(), ctx)); - delegate_kernel_op_attributes.push_back(op_name_attr); - - int attr_idx = 0; - for (const NamedAttribute& tf_attr : op.getAttrs()) { - // Small std::string benefits from small string optimization in libc++. - NamedAttribute attr_name( - Identifier::get("attr" + std::to_string(attr_idx) + "_name", ctx), - StringAttr::get(tf_attr.first, ctx)); - NamedAttribute attr_value( - Identifier::get("attr" + std::to_string(attr_idx) + "_value", ctx), - tf_attr.second); - delegate_kernel_op_attributes.push_back(attr_name); - delegate_kernel_op_attributes.push_back(attr_value); - attr_idx++; - } - - // Replace the TF op with TFD delegate kernel op. - auto new_op = rewriter.create( - op.getLoc(), delegate_kernel_op_results, delegate_kernel_op_operands, - delegate_kernel_op_attributes); - - // Create lowering cast ops for non-chain results. - llvm::SmallVector lowering_cast_ops_values; - // Skip the first result. It's a chain which has no current users. 
- for (int i = 1, e = new_op.getOperation()->getNumResults(); i != e; ++i) { - Type orig_input = op.getType(); - OperationState tmp_lowering_cast_op(new_op.getLoc(), - kTmpLoweringCastOpName, - {new_op.getOperation()->getResult(0), - new_op.getOperation()->getResult(i)}, - {orig_input}, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - lowering_cast_ops_values.push_back(repl_value); - } - - rewriter.replaceOp(op, lowering_cast_ops_values); - return success(); - } -}; - -class ReturnOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - // Replace std.return with hex.return. The first result is always a chain and - // each original TF tensor result is converted to a TFD tensor. - LogicalResult matchAndRewrite( - ReturnOp return_op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = return_op.getContext(); - Value chain_output = nullptr; - llvm::SmallVector new_return_op_operands; - new_return_op_operands.reserve(return_op.getNumOperands() + 1); - // Convert each TF tensor operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - if (tmp_lowering_cast_op->getName().getStringRef() != - kTmpLoweringCastOpName) { - assert(false && "unexpected producer of operand"); - } - if (chain_output == nullptr) { - // Get the input chain from the previous op or first block argument. - auto* block = return_op.getOperation()->getBlock(); - if (!block->isEntryBlock()) { - assert(false && "only supports a single block"); - } - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = return_op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - if (prev_op_result_0.getType() != GetChainType(ctx)) { - assert(false && - "delegate kernel must produce chain as the first result"); - } - chain_output = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - if (arg_0.getType() != GetChainType(ctx)) { - assert(false && "first block argument must be a chain"); - } - chain_output = arg_0; - } - new_return_op_operands.push_back(chain_output); - } - new_return_op_operands.push_back(tmp_lowering_cast_op->getOperand(1)); - } - // Replace the old std.return op with the new hex.return op. - rewriter.create(return_op.getLoc(), - new_return_op_operands); - rewriter.eraseOp(return_op); - - return success(); - } -}; - -void TfToTfdLoweringPass::runOnOperation() { - ConversionTarget target(getContext()); - - // Make tmp_lowering_cast_op legal for conversion. But delete them after the - // passes. - OperationName tmp_lowering_cast_op_name(kTmpLoweringCastOpName, - &getContext()); - target.setOpAction(tmp_lowering_cast_op_name, - ConversionTarget::LegalizationAction::Legal); - - // target.addLegalDialect(); - target.addLegalDialect(); - - target.addDynamicallyLegalOp([](FuncOp function) { - // Returns true if this function is legal, i.e. all inputs and outputs are - // TFRT types. 
- FunctionType type = function.getType(); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - if (type.getInput(i).isa()) return false; - } - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (type.getResult(i).isa()) return false; - } - return true; - }); - - target.addLegalOp(); - - OwningRewritePatternList patterns; - patterns.insert, - TFOpConversion, TFOpConversion, - TFOpConversion, TFOpConversion, - ReturnOpConversion>(&getContext()); - - if (failed(applyPartialConversion(getOperation(), target, patterns))) - signalPassFailure(); - - // Delete the tmp_lowering_cast_op's since they are illegal. - getOperation().walk([&tmp_lowering_cast_op_name](Operation* op) { - if (op->getName() == tmp_lowering_cast_op_name) op->erase(); - }); -} - -} // namespace -} // namespace mlir - -static mlir::PassRegistration pass( - "tf-to-tfd-lowering", "Lowers the TF dialect to Runtime Fallback dialect."); diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc deleted file mode 100644 index 4fd57af55cc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===----------------------------------------------------------------------===// -// -// This file implements a set of simple combiners for optimizing operations in -// the Runtime Fallback dialect. -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -// This optimizes the following scenario: -// %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) -// : (!dht.host_tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) -// %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) -// : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) -// some_op %dht1, %c3 -// -// becomes -// some_op %dht0, %c1 - -struct SimplifyDoubleConversion - : public mlir::OpRewritePattern { - // We register this pattern to match every tfd.move_dht_to_tft op. - // The "benefit" is used by the framework to order the patterns and process - // them in order of profitability. - explicit SimplifyDoubleConversion(mlir::MLIRContext* context) - : mlir::OpRewritePattern(context, - /*benefit=*/1) {} - - // This method attempts to match a pattern and rewrite it. The rewriter - // argument is the orchestrator of the sequence of rewrites. The pattern is - // expected to interact with it to perform any changes to the IR from here. - mlir::LogicalResult matchAndRewrite( - mlir::tfd::ConvertTftToDhtOp op, - mlir::PatternRewriter& rewriter) const override { - // Look through the inputs of the ConvertTftToDhtOp. 
- mlir::Value convert_op_input_0 = op.getOperand(0); - mlir::Value convert_op_input_1 = op.getOperand(1); - mlir::tfd::MoveDhtToTftOp move_input_op_0 = - llvm::dyn_cast_or_null( - convert_op_input_0.getDefiningOp()); - mlir::tfd::MoveDhtToTftOp move_input_op_1 = - llvm::dyn_cast_or_null( - convert_op_input_1.getDefiningOp()); - - // The inputs should be MoveDhtToTftOp. - if (!move_input_op_0 || !move_input_op_1) return mlir::failure(); - // Both inputs are the same MoveDhtToTftOp. - if (move_input_op_0 != move_input_op_1) return mlir::failure(); - - // Use the rewriter to replace the ConvertTftToDhtOp's users with the - // operands of MoveDhtToTftOp. - rewriter.replaceOp( - op, {move_input_op_0.getOperand(0), move_input_op_0.getOperand(1)}); - return mlir::success(); - } -}; - -// Register rewrite pattern as "canonicalization" patterns on the MoveDhtToTftOp -// so that they can be picked up by the Canonicalization framework. -void mlir::tfd::ConvertTftToDhtOp::getCanonicalizationPatterns( - OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); -} diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc deleted file mode 100644 index 9c69154673b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -namespace mlir { -namespace tfd { - -//===----------------------------------------------------------------------===// -// TfrtDelegate Dialect -//===----------------------------------------------------------------------===// - -RuntimeFallbackDialect::RuntimeFallbackDialect(MLIRContext *context) - : Dialect(/*name=*/"tfd", context) { - allowUnknownTypes(); - - allowUnknownOperations(); - - addOperations< -#define GET_OP_LIST -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - >(); -} - -//===----------------------------------------------------------------------===// -// TableGen'd op method definitions -//===----------------------------------------------------------------------===// - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - -} // namespace tfd -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h deleted file mode 100644 index 009d565e40d..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
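Because SimplifyDoubleConversion above is registered through getCanonicalizationPatterns, it is picked up by MLIR's generic canonicalizer rather than by a dedicated pass; the opt.mlir test later in this diff drives it with -pass-pipeline='func(canonicalize)'. A rough programmatic equivalent is sketched below, assuming the MLIR pass APIs of this vintage; the function name is illustrative.

```cpp
// Sketch: run the generic canonicalizer so that patterns registered via
// getCanonicalizationPatterns (such as SimplifyDoubleConversion) are applied.
#include "mlir/IR/Function.h"        // from @llvm-project
#include "mlir/IR/Module.h"          // from @llvm-project
#include "mlir/Pass/PassManager.h"   // from @llvm-project
#include "mlir/Transforms/Passes.h"  // from @llvm-project

mlir::LogicalResult CanonicalizeRuntimeFallbackOps(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  // Mirrors -pass-pipeline='func(canonicalize)': canonicalize each function.
  pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
  return pm.run(module);
}
```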
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file defines the operations used in the Runtime Fallback dialect. - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ - -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project - -namespace mlir { -namespace tfd { - -// Dialect for TFRT delegate operations. -class RuntimeFallbackDialect : public Dialect { - public: - explicit RuntimeFallbackDialect(MLIRContext* context); - static StringRef getDialectNamespace() { return "tfd"; } -}; - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.h.inc" - -} // namespace tfd -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td deleted file mode 100644 index aeed800a1c3..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This is the definition file for the Runtime Fallback Dialect. - -#ifdef TFRT_DELEGATE_DIALECT -#else -#define TFRT_DELEGATE_DIALECT - -include "tfrt/tfrt_op_base.td" -include "mlir/Interfaces/SideEffects.td" - -//===----------------------------------------------------------------------===// -// Type definitions -//===----------------------------------------------------------------------===// -def TfTensorType : OpaqueType<"tfd", "tf_tensor", "!tfd.tf_tensor type">; - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect definitions -//===----------------------------------------------------------------------===// - -def RuntimeFallback_Dialect : Dialect { - let name = "tfd"; - - let description = [{ - The Runtime Fallback dialect. - - This dialect contains operations to run existing TF kernels on TFRT by - invoking TF Eager API. 
- }]; - - let cppNamespace = "tfd"; -} - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect Ops definitions -//===----------------------------------------------------------------------===// - -// Base class for the operation in this dialect. -class RuntimeFallbackDialect_Op traits = []> : - Op { } - -def InitEagerContextOp : RuntimeFallbackDialect_Op<"init_eager_context"> { - let summary = "eager context initialization operation"; - let description = [{ - The "tfd.init_eager_context" operation takes an input chain, creates and - initializes the TF EagerContext and returns an output chain. - - Example: - %c1 = "tfd.init_eager_context"(%c0): (!hex.chain) -> !hex.chain - }]; - - let arguments = (ins ChainType); - let results = (outs ChainType); -} - -def DelegateKernelOp : RuntimeFallbackDialect_Op<"delegate_kernel"> { - let summary = "delegate kernel operation"; - let description = [{ - The "tfd.delegate_kernel" operation takes an input chain, and arbitrary - number of input arguments, and runs a specified TF op via TFE C API. It - returns an output chain and variable number of outputs from the TF op. - - The input arguments and attributes are passed to the TF op. The ouputs are - outputs of the TF op. - - Note that `_name` is a required attribute specifying the TF op to run. - TFRT attributes are sorted alphabetically, passed in as positional - attributes to the TFRT kernel, rather than as named attributes. - - Example: - To run "tf.MatMul" op, which has two boolean attributes, - 1. Set _name = "MatMul" - 2. For each TF attribute, split it into two attributes, one for name of - the TF attribute, and the other for the type and value of the - attribute value. Attribute value is a string with the format of - "type$val", where type can be "bool", "string", "tfdtype", "tfshape", - "tftensor". - The value serialization format can be found in attr_util.h. - - %out_c, %out_tensor = "tfd.delegate_kernel"( - %in_c, %in1_tensor, %in2_tensor) { - _name = "MatMul", - attr1_name = "transpose_a", attr1_value = "bool$false", - attr2_name = "transpose_b", attr2_value = "bool$false" - } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) -> ( - !hex.chain, !tfd.tf_tensor) - }]; - - let arguments = (ins ChainType, Variadic); - let results = (outs ChainType, Variadic); -} - -def PrintTftOp : RuntimeFallbackDialect_Op<"print_tft"> { - let summary = "print TF tensor operation"; - let description = [{ - The "tfd.print_tft" operation prints the input TF tensor. It takes an input - TF tensor to be printed and an input chain, and returns an output chain. - - Example: - %c1 = "tfd.print_tft"(%t, %c) : (!tfd.tf_tensor, !hex.chain) -> !hex.chain - - }]; - - let arguments = (ins TfTensorType, ChainType); - let results = (outs ChainType); -} - -def ConvertTftToDhtOp : RuntimeFallbackDialect_Op<"convert_tft_to_dht", [NoSideEffect]> { - let summary = "convert TF tensor to TFRT DHT tensor operation"; - let description = [{ - The "tfd.convert_tft_to_dht" operation converts a TF tensor to a TFRT - DenseHostTensor. - - It takes as input a TF Tensor and an input chain, and returns a converted - TFRT DHT tensor and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TfTensorType, ChainType); - // Enable registering canonicalization patterns with this operation. 
- let hasCanonicalizer = 1; - let results = (outs TensorType, ChainType); -} - -def MoveDhtToTftOp : RuntimeFallbackDialect_Op<"move_dht_to_tft", [NoSideEffect]> { - let summary = "convert TFRT DHT tensor to DHT tensor operation"; - let description = [{ - The "tfd.move_dht_to_tft" operation moves a TFRT tensor into a TF Tensor. - - It takes as input a TFRT Tensor and an input chain, and returns a TF tensor - with the same underlying buffer and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TensorType, ChainType); - let results = (outs TfTensorType, ChainType); -} - -#endif // TFRT_DELEGATE_DIALECT diff --git a/tensorflow/compiler/mlir/tfrt/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/BUILD deleted file mode 100644 index 4faa8d2efe8..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. -filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-opt", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD deleted file mode 100644 index fc7c142ea73..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. 
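As a side note on the "type$val" attribute encoding documented for tfd.delegate_kernel above (e.g. "bool$false", "tfdtype$DT_FLOAT"): the authoritative decoding lives in attr_util.h, which is not part of this diff. The sketch below only illustrates splitting the wire format, with a hypothetical helper name.

```cpp
// Sketch: split a "type$val" attribute string into its type tag and value.
// Purely illustrative; real parsing and validation happen in attr_util.h.
#include <string>
#include <utility>

std::pair<std::string, std::string> SplitTypedAttrValue(
    const std::string& encoded) {
  // The tag is everything before the first '$'; the value is the rest.
  const std::size_t pos = encoded.find('$');
  if (pos == std::string::npos) return {std::string(), encoded};
  return {encoded.substr(0, pos), encoded.substr(pos + 1)};
}
```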
-filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-mlir-translate", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir b/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir deleted file mode 100644 index 5943997a1bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: tf-mlir-translate -analyze-tf-for-tfrt %s | FileCheck %s - -func @main(%serialized: tensor<32x!tf.string>, - %names : tensor<32x!tf.string>, - %dense_keys : tensor<2x!tf.string>, - %dense_default_0 : tensor, - %dense_default_1 : tensor) { - // CHECK: summary { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.AssignVariableOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.Const" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 2 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.ParseExampleV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VarHandleOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VariableV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - %0 = "tf.VariableV2"() {shape = #tf.shape<2>, container = "", shared_name = ""} : () -> tensor - %1 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor - %2 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - "tf.AssignVariableOp"(%2, %1) : (tensor<*x!tf.resource>, tensor) -> () - %empty_str_vector = "tf.Const"() - {dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} - : () -> tensor<0x!tf.string> - %result:2 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) - {dense_shapes = [#tf.shape<>, #tf.shape<>], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 2, 0, 0]> : vector<6xi32>} - : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>) - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir b/tensorflow/compiler/mlir/tfrt/tests/basics.mlir deleted file mode 100644 index 650bd04b882..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: tf-opt -tf-legalize-to-hex %s -o -| FileCheck %s - - -// CHECK-LABEL: func @constants() { -func @constants() { - // CHECK: "hex.constant_int"() {value = 1 : i32} - %0 = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "x", value = dense<1> : tensor} : () -> tensor - // CHECK: "hex.constant_int"() {value = 42 : i32} - %1 = 
"tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "y", value = dense<42> : tensor<1x1xi32>} : () -> tensor<1x1xi32> - // CHECK: hex.return - return -} - -// CHECK-LABEL: func @add -func @add(%arg0: tensor<1xi32>) { - // CHECK: hex.add_int - %2 = "tf.Add"(%arg0, %arg0) {T = "tfdtype$DT_INT32", device = "", name = "z"} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir b/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir deleted file mode 100644 index 410ff299883..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: tf-opt %s -tf-legalize-to-hex -verify-diagnostics - -func @partial_convert() { - %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // expected-error @+1 {{failed to legalize operation 'tf.Const'}} - %1 = "tf.Const"() {value = dense<42> : tensor<2xi32>} : () -> tensor<2xi32> - %2 = "tf.Add"(%0, %1) : (tensor, tensor<2xi32>) -> tensor<2xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir b/tensorflow/compiler/mlir/tfrt/tests/opt.mlir deleted file mode 100644 index 6f27fa6d7e4..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: tf-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s - -// CHECK-LABEL: func @simplify_double_conversion_test( -func @simplify_double_conversion_test() { - // CHECK: %[[CREATE:.*]] = dht.create - // CHECK: %[[FILL:.*]] = dht.fill - // CHECK: dht.print_tensor %[[CREATE]], %[[FILL]] - %c0 = hex.new.chain - - // Create 2x2 dht with value 1 - %dht0 = dht.create_uninitialized_tensor.i32.2 [2 : i32, 2 : i32] - %c1 = dht.fill_tensor_with_constant.i32 %dht0, %c0 1 : i32 - - // Convert dht to tf tensor - %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) - : (!t.tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) - - // Convert tf tensor back to dht - %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) - : (!tfd.tf_tensor, !hex.chain) -> (!t.tensor, !hex.chain) - - // Print the result dht - %c4 = dht.print_tensor %dht1, %c3 - - hex.return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir deleted file mode 100644 index 6c129c4be22..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -module attributes {tf_saved_model.semantics} { - -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () - -// CHECK-LABEL: func @basic -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<1x3xf32> - - // CHECK: {{%.*}} = corert.executeop({{%.*}}) "tf.MatMul" - // CHECK-SAME: {T = f32, transpose_a = false, transpose_b = false} - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} - 
-} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir deleted file mode 100644 index 40b0332b61c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-NOT: tf_saved_model.semantics -module attributes {tf_saved_model.semantics} { - -// CHECK-NOT: "tf_saved_model.global_tensor" -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "z", type = tensor<3xf32>, value = dense<[1.67482901, -0.529208779, -0.803792417]> : tensor<3xf32>} : () -> () - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle, -// CHECK-SAME: [[arg2:%.*]]: !corert.tensorhandle) -> !corert.tensorhandle { -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}, - %arg2: tensor>> {tf_saved_model.bound_input = @z}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - // CHECK-NEXT: [[cpu_device:%.*]] = corert.get_device "cpu" - // CHECK-NEXT: [[r0:%.*]] = corert.executeop([[cpu_device]]) "tf.MatMul"([[arg0]], [[arg1]]) - // CHECK-NEXT: [[r1:%.*]] = corert.executeop([[cpu_device]]) "tf.BiasAdd"([[r0]], [[arg2]]) - // CHECK-NEXT: [[r2:%.*]] = corert.executeop([[cpu_device]]) "tf.Tanh"([[r1]]) - // CHECK-NEXT: hex.return [[r2]] : !corert.tensorhandle - - %0 = "tf.ReadVariableOp"(%arg2) {_output_shapes = ["tfshape$dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<3xf32> - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<1x3xf32> - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - %3 = "tf.BiasAdd"(%2, %0) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], data_format = "NHWC", device = "cpu"} : (tensor<3x3xf32>, tensor<3xf32>) -> tensor<3x3xf32> - %4 = "tf.Tanh"(%3) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - %5 = "tf.Identity"(%4) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - return %5 : tensor<3x3xf32> -} - -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir deleted file mode 100644 index 774ea0526bd..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @derived_attrs -func @derived_attrs( - %serialized: tensor, - %names: tensor<0x!tf.string>, - %sparse_keys: tensor<0x!tf.string>, - %dense_keys: tensor<1x!tf.string>, - %ragged_keys: tensor<0x!tf.string>, - %dense_default: tensor<0xi64>) -> tensor { - - %dense_value = - "tf.ParseExampleV2"(%serialized, %names, %sparse_keys, %dense_keys, %ragged_keys, %dense_default) - // CHECK: Tdense = [i64] - // CHECK-SAME: 
dense_shapes = [#corert.shape<>] - { device = "cpu", num_sparse = 0 : i64, dense_shapes = [#tf.shape<>], result_segment_sizes = dense<[0, 0, 0, 1, 0, 0]> : vector<6xi32>} - : (tensor, tensor<0x!tf.string>, tensor<0x!tf.string>, tensor<1x!tf.string>, tensor<0x!tf.string>, tensor<0xi64>) - -> tensor - - return %dense_value : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir deleted file mode 100644 index 7077523b1e2..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @device_test -func @device_test( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor<1x3xf32> {tf_saved_model.index_path = [0]}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) { - // CHECK: {{%.*}} = corert.get_device "gpu" - - %2 = "tf.MatMul"(%arg0, %arg1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "gpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir deleted file mode 100644 index 950cef928a9..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -corert-optimize %s | FileCheck %s - -// CHECK-LABEL: func @fold_test -func @fold_test(%arg: !corert.tensorhandle) -> !corert.tensorhandle { - %cpu = corert.get_device "cpu" - // CHECK-NOT: tf.Const - %0 = corert.executeop(%cpu) "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 - // CHECK: "_tf.Transpose"({{%.*}}) - // CHECK-SAME: perm = dense<[0, 3, 1, 2]> : tensor<4xi32> - %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) {T = f32, Tperm = i32} : 1 - hex.return %1 : !corert.tensorhandle -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir deleted file mode 100644 index b1306be825c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @string_tensor -func @string_tensor() -> (tensor<0x!tf.string>, tensor<7x!tf.string>) { - // CHECK: {shape = [0], value = []} - %0 = "tf.Const"() {value = dense<[]> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - // CHECK: {shape = [7], value = ["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]} - %1 = "tf.Const"() {value = dense<["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]> : tensor<7x!tf.string>} : () -> tensor<7x!tf.string> - return %0, %1 : tensor<0x!tf.string>, tensor<7x!tf.string> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir deleted file mode 100644 index 5c44f558280..00000000000 --- 
a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: tf-opt -tf-executor-to-corert-pipeline %s | FileCheck %s - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle) -// CHECK-NEXT: [[cpu:%.*]] = corert.get_device "cpu" -// CHECK-NEXT: [[res:%.*]] = corert.executeop([[cpu]]) "tf.MatMul"([[arg0]], [[arg1]]) -// CHECK-NEXT: hex.return [[res]] : !corert.tensorhandle -module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 293 : i32}} { - func @basic(%arg0: tensor<3x1xf32>, - %arg1: tensor>> - ) -> tensor<3x3xf32> { - %0 = tf_executor.graph { - %outputs, %control = tf_executor.island wraps "tf.Const"() {value = dense<0.899999976> : tensor} : () -> tensor - %outputs_0, %control_0 = tf_executor.island { - %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor<*x!tf.resource> - %2 = "tf.ReadVariableOp"(%1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "", dtype = f32} : (tensor<*x!tf.resource>) -> tensor<1x3xf32> - %3 = "tf.MatMul"(%arg0, %2) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - tf_executor.yield %3 : tensor<3x3xf32> - } - tf_executor.fetch %outputs_0, %control_0 : tensor<3x3xf32>, !tf_executor.control - } - return %0 : tensor<3x3xf32> - } -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir deleted file mode 100644 index 5968a590f91..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: tf-opt %s -tf-to-tfd-lowering | FileCheck %s - -// CHECK: func @inference_call( -// CHECK-SAME: %arg0: !hex.chain, -// CHECK-SAME: %arg1: !tfd.tf_tensor, -// CHECK-SAME: %arg2: !tfd.tf_tensor, -// CHECK-SAME: %arg3: !tfd.tf_tensor, -// CHECK-SAME: %arg4: !tfd.tf_tensor, -// CHECK-SAME: %arg5: !tfd.tf_tensor -// CHECK-SAME: ) -> (!hex.chain, !tfd.tf_tensor) -func @inference_call( - %arg0: tensor, - %arg1: tensor<*x!tf.resource>, - %arg2: tensor<*x!tf.resource>, - %arg3: tensor<*x!tf.resource>, - %arg4: tensor<*x!tf.resource> - )-> tensor { - // CHECK: %0:2 = "tfd.delegate_kernel"(%arg0, %arg5) - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) - %0 = "tf.ReadVariableOp"(%arg4) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<10xf32> - - // CHECK: %1:2 = "tfd.delegate_kernel"(%0#0, %arg3) { - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %1 = "tf.ReadVariableOp"(%arg2) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512xf32> - - // CHECK: %2:2 = "tfd.delegate_kernel"(%1#0, %arg4) { - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %2 = "tf.ReadVariableOp"(%arg3) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512x10xf32> - - // CHECK: %3:2 = "tfd.delegate_kernel"(%2#0, %arg2) 
{ - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %3 = "tf.ReadVariableOp"(%arg1) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<784x512xf32> - - // CHECK: %4:2 = "tfd.delegate_kernel"(%3#0, %arg1, %3#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %4 = "tf.MatMul"(%arg0, %3) { - dtype = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false - } : (tensor, tensor<784x512xf32>) -> tensor - - // CHECK: %5:2 = "tfd.delegate_kernel"(%4#0, %4#1, %1#1) { - // CHECK-SAME: _name = "tf.AddV2" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %5 = "tf.AddV2"(%4, %1) - : (tensor, tensor<512xf32>)-> tensor - - // CHECK: %6:2 = "tfd.delegate_kernel"(%5#0, %5#1) { - // CHECK-SAME: _name = "tf.Relu", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %6 = "tf.Relu"(%5) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: %7:2 = "tfd.delegate_kernel"(%6#0, %6#1, %2#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %7 = "tf.MatMul"(%6, %2) { - dtype = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false - } : (tensor, tensor<512x10xf32>) -> tensor - - // CHECK: %8:2 = "tfd.delegate_kernel"(%7#0, %7#1, %0#1) { - // CHECK-SAME: _name = "tf.AddV2", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %8 = "tf.AddV2"(%7, %0) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor, tensor<10xf32>) -> tensor - - // CHECK: %9:2 = "tfd.delegate_kernel"(%8#0, %8#1) { - // CHECK-SAME: _name = "tf.Identity", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %9 = "tf.Identity"(%8) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: hex.return %9#0, %9#1 : !hex.chain, !tfd.tf_tensor - return %9 : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc b/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc deleted file mode 100644 index 9d13955490b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of Tf dialect to TFRT Hex kernels. -// -// Current lowering is a placeholder performing trivial conversion -// for integer constants and additions. - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/DialectConversion.h" -#include "absl/memory/memory.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" - -namespace mlir { -namespace { - -// Pattern rewrite rules for "tf.Const", "tf.Add" and "return" ops. -bool isInt32LikeType(Type t) { - if (t.isSignlessInteger(32)) return true; - if (auto ttype = t.dyn_cast()) { - if (ttype.hasStaticShape() && ttype.getNumElements() == 1 && - ttype.getElementType().isSignlessInteger(32)) - return true; - } - return false; -} - -// Replaces 32-bit integer TF::ConstOp with "hex.constant_int" op. -struct ConstOpConversion : public ConversionPattern { - explicit ConstOpConversion(MLIRContext *context) - : ConversionPattern(TF::ConstOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto constOp = cast(op); - if (!isInt32LikeType(constOp.getType())) return failure(); - - auto valueAttr = constOp.value(); - auto newAttr = Attribute(); - - // Convert constant op if it has an integer or dense elements attribute. - // Other kinds of element attributes are not converted for now. - if (valueAttr.isa()) { - newAttr = valueAttr; - } else if (auto v = valueAttr.dyn_cast()) { - if (v.isSplat()) newAttr = v.getSplatValue(); - } - if (!newAttr) return failure(); - - mlir::OperationState state(constOp.getLoc(), "hex.constant_int"); - state.types.push_back(rewriter.getIntegerType(32)); - state.addAttribute("value", newAttr); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces 32-bit integer TF::Add op with "hex.add_int" op. -struct AddOpConversion : public ConversionPattern { - explicit AddOpConversion(MLIRContext *context) - : ConversionPattern(TF::AddOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto addOp = cast(op); - - if (!isInt32LikeType(operands[0].getType()) || - !isInt32LikeType(operands[1].getType())) - return failure(); - - auto int32Ty = rewriter.getIntegerType(32); - mlir::OperationState state(addOp.getLoc(), "hex.add_int", operands, - {int32Ty}, {}); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces return op that has no arguments with "hex.return" op. 
-struct ReturnOpConversion : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - ReturnOp srcOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - if (srcOp.getNumOperands() != 0) return failure(); - - mlir::OperationState state(srcOp.getLoc(), "hex.return"); - rewriter.createOperation(state); - - rewriter.eraseOp(srcOp); - return success(); - } -}; - -// Legalize TF operations to host program dialect. -struct TfLegalizeToHex - : public PassWrapper> { - void runOnOperation() override { - auto *ctx = &getContext(); - TypeConverter converter; - converter.addConversion([](Type type) -> Type { - // Convert single element tensor type of int32s to int32 type - if (isInt32LikeType(type)) { - return IntegerType::get(32, type.getContext()); - } - return Type(); - }); - - OwningRewritePatternList patterns; - - // For now, replace only int32 TF::OpConst, TF::OpAdd and OpReturn with - // "hex.constant_int", "hex.add_int" and "hex.return", respectively. - patterns.insert( - ctx); - - ConversionTarget target(*ctx); - const auto legal = ConversionTarget::LegalizationAction::Legal; - target.setOpAction(OperationName(StringRef("hex.constant_int"), ctx), - legal); - target.setOpAction(OperationName(StringRef("hex.add_int"), ctx), legal); - target.setOpAction(OperationName(StringRef("hex.return"), ctx), legal); - target.addLegalOp(); - - auto result = - applyFullConversion(getOperation(), target, patterns, &converter); - if (failed(result)) signalPassFailure(); - } -}; - -} // namespace - -std::unique_ptr> createLegalizeToHexPass() { - return std::make_unique(); -} - -static PassRegistration pass( - "tf-legalize-to-hex", - "Convert TF dialect to the TF runtime host program dialect."); -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc b/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc deleted file mode 100644 index 9e06ba1f4bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements the optimzation passe on TFRT CoreRuntime dialect. -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// Implement a constant fold pattern for corert dialect. 
The following pattern -// will be matched: -// -// %0 = corert.executeop(%cpu) "tf.Const"() -// {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) -// {T = f32, Tperm = i32} : 1 -// -// And it will converted to: -// -// %1 = corert.executeop(%cpu) "_tf.Transpose"(%arg) -// {T = f32, Tperm = i32, perm = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// -class CoreRTExecuteOpRewritePattern - : public mlir::OpRewritePattern { - public: - CoreRTExecuteOpRewritePattern( - mlir::MLIRContext *context, - ArrayRef>> ops_to_attrs) - : OpRewritePattern(context), - ops_to_attrs_(ops_to_attrs.begin(), ops_to_attrs.end()) {} - - mlir::LogicalResult matchAndRewrite( - tfrt::corert::ExecuteOp op, - mlir::PatternRewriter &rewriter) const override { - auto attr_names = ops_to_attrs_.lookup(op.op_name()); - if (attr_names.empty()) return failure(); - - SmallVector new_operands; - SmallVector, 4> new_attributes; - op.getOpAttrs(&new_attributes); - assert(op.operands().size() == attr_names.size()); - for (const auto &iter : llvm::zip(op.operands(), attr_names)) { - mlir::Value arg = std::get<0>(iter); - StringRef name = std::get<1>(iter); - - Attribute const_attr; - if (!name.empty() && matchPattern(arg, m_Constant(&const_attr))) { - // Convert the folded argument to an attribute. - new_attributes.push_back({name, const_attr}); - } else { - // Keep the argument that is not folded. - new_operands.push_back(arg); - } - } - - if (new_operands.size() == op.operands().size()) return failure(); - - SmallString<32> new_op_name{"_"}; - new_op_name += op.op_name(); - - rewriter.replaceOpWithNewOp( - op, op.getResultTypes(), op.device(), new_operands, new_attributes, - new_op_name); - - return success(); - } - - private: - // Map from op_name to attr_names. The attr_names indicates the name of the - // attribute to which each constant-folded argument is converted. An empty - // string means this argument should not be folded. - llvm::DenseMap> ops_to_attrs_; -}; - -struct CoreRTOptimizePass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::OwningRewritePatternList patterns; - auto func = getFunction(); - - static constexpr StringRef kMeanAttrs[] = {"", "reduction_indices"}; - static constexpr StringRef kPadAttrs[] = {"", "paddings"}; - static constexpr StringRef kTransposeAttrs[] = {"", "perm"}; - - static constexpr std::pair> kOpsToAttrs[] = { - {"tf.Mean", kMeanAttrs}, - {"tf.Pad", kPadAttrs}, - {"tf.Transpose", kTransposeAttrs}, - }; - - patterns.insert(&getContext(), kOpsToAttrs); - - mlir::applyPatternsAndFoldGreedily(func, patterns); - } -}; - -} // namespace - -std::unique_ptr> CreateCoreRTOptimizePass() { - return std::make_unique(); -} - -static mlir::PassRegistration pass("corert-optimize", - "Optimizes corert."); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/passes.h deleted file mode 100644 index be0bf0fbd1f..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ - -#include - -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project - -namespace tensorflow { - -// Create a pass that converts MLIR TF dialect to MLIR TFRT CoreRT dialect. -std::unique_ptr CreateTFToCoreRTConversionPass(); - -// Run TFToCoreRTConversionPass as a free function. Useful for reusing the pass -// logic in a custom pass with additional conversions. -mlir::LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext* context, mlir::ModuleOp* module, - mlir::ConversionTarget* target, mlir::OwningRewritePatternList* patterns); - -// Create the corert optimization pass. -std::unique_ptr> CreateCoreRTOptimizePass(); - -struct CoreRTPipelineOptions - : public mlir::PassPipelineOptions { - Option default_device{ - *this, "default-device", llvm::cl::desc("default device assignment"), - llvm::cl::init("cpu")}; - Option enable_optimizer{ - *this, "enable-optimizer", - llvm::cl::desc("run optimization passes on corert dialect"), - llvm::cl::init(false)}; - Option force_data_format{ - *this, "force-data-format", - llvm::cl::desc("force data format for all layout sensitive operations")}; -}; - -// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF -// dialect for CoreRT purposes. -void CreateTFExecutorToTFPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -// Creates a pipeline of passes that converts MLIR TF Executor dialect to CoreRT -// dialect. -void CreateTFExecutorToCoreRTPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc deleted file mode 100644 index 0784dc4ffea..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of TF dialect to TFRT CoreRuntime ExecuteOp. -// This lowering pass is heavily experimental and incomplete. External code -// should not depend on the code here. And please do not take example on it as -// "the path forward" for this. 
- -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Pass/PassOptions.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/tstring.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" -#include "tfrt/core_runtime/opdefs/attributes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// TODO(chky): define these dialect types instead of using opaque types. -mlir::Type CreateDeviceType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "device", - builder->getContext()); -} - -mlir::Type CreateTensorHandleType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "tensorhandle", - builder->getContext()); -} - -mlir::Type CreateStringType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("hex"), "string", - builder->getContext()); -} - -// A helper class for converting CoreRT types and attributes. -class CoreRTConverter : public mlir::TypeConverter { - public: - explicit CoreRTConverter(mlir::MLIRContext *context) - : builder_(context), - device_type_(CreateDeviceType(&builder_)), - tensor_handle_type_(CreateTensorHandleType(&builder_)) { - addConversion([](Type type) { return type; }); - addConversion([=](TensorType type) { return tensor_handle_type_; }); - } - - // Create a single attribute that contains the named attribute lists. It is an - // array of pairs. The key must be a string attribute, and the value can be - // any attribute that is supported by CoreRuntime. - mlir::ArrayAttr CreateOpAttrs(ArrayRef attrs) { - llvm::SmallVector attr_array; - for (auto key_and_value : attrs) { - if (!IsUnusedAttribute(key_and_value.first)) { - auto converted = ConvertAttribute(key_and_value.second); - if (!converted) return {}; - - mlir::StringAttr key = builder_.getStringAttr(key_and_value.first); - attr_array.push_back(builder_.getArrayAttr({key, converted})); - } - } - return builder_.getArrayAttr(attr_array); - } - - // Convert the device attribute in `op` to a device value produced by the - // corresponding GetDeviceOp in the current block. If there does not exist - // one, insert a GetDeviceOp to the beginning of the block and return the - // device value. 
- Value ConvertDevice(mlir::Operation *op, - ConversionPatternRewriter *rewriter) const { - auto device_attr = op->getAttr("device"); - if (!device_attr) { - op->emitOpError("device attribute not found."); - return {}; - } - - auto device_name = device_attr.cast().getValue(); - if (device_name.empty()) { - op->emitOpError("device has not been assigned."); - return {}; - } - - op->removeAttr(rewriter->getIdentifier("device")); - - auto *block = op->getBlock(); - - if (auto get_device_op = GetDeviceOrNull(device_name, block)) - return get_device_op.device(); - - ConversionPatternRewriter::InsertionGuard insertion_guard(*rewriter); - rewriter->setInsertionPointToStart(block); - return rewriter - ->create(block->getParent()->getLoc(), - device_type(), device_name) - .device(); - } - - mlir::Type device_type() const { return device_type_; } - mlir::Type tensor_handle_type() const { return tensor_handle_type_; } - - private: - // TODO(chky): attributes "_output_shapes" should be removed by any tool that - // generates TF MLIR dialect, as they are not used by CoreRuntime. Remove this - // filtering logic once unused attributes are cleaned up in the upper layer. - bool IsUnusedAttribute(llvm::StringRef name) const { - return name == "_output_shapes"; - } - - // Returns the converted attribute in TFRT dialect. If the conversion fails, - // returns a null attribute instead. - mlir::Attribute ConvertAttribute(mlir::Attribute attr) { - // The supported attributes here should be kept consistent with - // //third_party/tf_runtime/include/tfrt/core_runtime/op_attr_type.h - // - // Currently, not all tensorflow data types are supported. Unranked shape - // attributes are not supported yet. - - // Return directly if the attribute is already supported. - if (attr.isa() || attr.isa() || - attr.isa() || attr.isa() || - attr.isa() || - attr.isa()) - return attr; - - // Convert the attribute to the corresponding format in TFRT dialect if - // needed. - if (auto shape_attr = attr.dyn_cast()) { - if (!shape_attr.hasRank()) return {}; - return tfrt::corert::ShapeAttr::get(builder_.getContext(), - shape_attr.getShape()); - } - - // For arrays, we recursively convert the elements. - if (auto array_attr = attr.dyn_cast()) { - llvm::SmallVector attrs; - attrs.reserve(array_attr.size()); - for (auto attr : array_attr) { - auto converted = ConvertAttribute(attr); - if (!converted) return {}; - attrs.push_back(converted); - } - return builder_.getArrayAttr(attrs); - } - - return {}; - } - - // Find a GetDeviceOp that matches the device_name at the beginning of the - // block. Return nullptr if it does not find one. - tfrt::corert::GetDeviceOp GetDeviceOrNull(StringRef device_name, - Block *block) const { - for (auto &op : *block) { - auto get_device_op = llvm::dyn_cast(&op); - if (!get_device_op) break; - if (get_device_op.device_name() == device_name) return get_device_op; - } - return nullptr; - } - - mlir::Builder builder_; - mlir::Type device_type_; - mlir::Type tensor_handle_type_; -}; - -// Lower a tf.Const op that creates a string tensor to a native -// corert.create_string_tensor op. 
-class CoreRTConstStringTensorOpConversion - : public mlir::OpConversionPattern { - public: - CoreRTConstStringTensorOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - mlir::TF::ConstOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - if (!op.dtype().isa()) return failure(); - - DenseStringElementsAttr attr = op.value().cast(); - - llvm::SmallVector values; - values.reserve(attr.getNumElements()); - for (const auto &element : attr.getRawStringData()) - values.push_back(rewriter.getStringAttr( - llvm::StringRef(element.data(), element.size()))); - - // Create the shape attribute from the tensor shape. - ArrayRef shape = op.value().getType().getShape(); - llvm::SmallVector dims; - dims.reserve(shape.size()); - auto i64_type = rewriter.getIntegerType(64); - for (auto dim : shape) - dims.push_back(rewriter.getIntegerAttr(i64_type, dim)); - - auto new_op = rewriter.create( - op.getLoc(), corert_converter_.tensor_handle_type(), - rewriter.getArrayAttr(dims), rewriter.getArrayAttr(values)); - - rewriter.replaceOp(op, new_op.result()); - - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Convert TF dialect operations with no side effects to CoreRT ExecuteOp. For -// example, -// -// %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> -// -// is converted to -// -// %result = corert.executeop(%device) -// "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (!corert.tensorhandle, !corert.tensorhandle) -> !corert.tensorhandle -// -// Note that it will fail to match if some attributes are not supported. -template -class CoreRTExecuteOpConversion : public mlir::OpConversionPattern { - public: - CoreRTExecuteOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - mlir::StringAttr op_name = rewriter.getStringAttr(op.getOperationName()); - - llvm::SmallVector result_types; - for (auto type : op.getOperation()->getResultTypes()) - result_types.push_back(corert_converter_.convertType(type)); - - // Get the device, or create one if there does not exist one. - auto device = corert_converter_.ConvertDevice(op, &rewriter); - if (!device) return failure(); - - auto derived_attrs = op.materializeDerivedAttributes(); - for (auto named_attr : derived_attrs) { - op.setAttr(named_attr.first, named_attr.second); - } - - ArrayAttr op_attrs = corert_converter_.CreateOpAttrs(op.getAttrs()); - if (!op_attrs) return failure(); - - auto new_op = rewriter.create( - op.getLoc(), result_types, device, operands, op_attrs, op_name); - - rewriter.replaceOp(op, new_op.results()); - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Deletes the op and forwards the arguments. -template -class PassThroughConversion : public mlir::OpConversionPattern { - public: - explicit PassThroughConversion(MLIRContext *context) - : mlir::OpConversionPattern(context) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - // Just forward the arguments to results. 
- rewriter.replaceOp(op, operands); - return success(); - } -}; - -// Convert standard ReturnOp to hex.return. -// -// TODO(chky): conversion to hex kernels should come from a common tf_to_hex -// library. -class ReturnOpConversion : public mlir::OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, operands); - return success(); - } -}; - -// Convert TF dialect to CoreRT dialect. -class TFToCoreRTConversionPass - : public mlir::PassWrapper> { - void runOnOperation() override { - auto module = getOperation(); - mlir::ConversionTarget target(getContext()); - mlir::OwningRewritePatternList patterns; - if (failed(TFToCoreRTConversionPassRun(&getContext(), &module, &target, - &patterns))) - signalPassFailure(); - } -}; - -} // namespace - -LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext *context, mlir::ModuleOp *module, - mlir::ConversionTarget *target, mlir::OwningRewritePatternList *patterns) { - module->removeAttr("tf_saved_model.semantics"); - - mlir::Builder builder(context); - auto bound_id = builder.getIdentifier("tf_saved_model.bound_input"); - auto path_id = builder.getIdentifier("tf_saved_model.index_path"); - - module->walk([bound_id, path_id, module](mlir::Operation *op) mutable { - if (auto func_op = dyn_cast(op)) { - // Remove tf_saved_model specific function arg attributes. - for (unsigned i = 0, e = func_op.getNumArguments(); i != e; ++i) { - func_op.removeArgAttr(i, bound_id); - func_op.removeArgAttr(i, path_id); - } - for (unsigned i = 0, e = func_op.getNumResults(); i != e; ++i) { - func_op.removeResultAttr(i, bound_id); - func_op.removeResultAttr(i, path_id); - } - if (auto exported_names = func_op.getAttrOfType( - "tf_saved_model.exported_names")) { - // Create a function for each exported name. - // - // TODO(b/148477882): TFRT dialect should have similar concepts of - // exported names so that a function can be referenced by multiple - // exported names. - func_op.removeAttr("tf_saved_model.exported_names"); - for (auto exported_name : exported_names) { - auto exported_func_op = func_op.clone(); - exported_func_op.setName( - exported_name.cast().getValue()); - module->insert(module->begin(), exported_func_op); - } - func_op.erase(); - } - } else if (isa(op)) { - // Remove all global_tensor_ops. - op->erase(); - } - }); - - CoreRTConverter corert_converter(context); - - target->addLegalDialect(); - target->addLegalDialect(); - target->addIllegalDialect(); - target->addDynamicallyLegalOp([&corert_converter](FuncOp op) { - return corert_converter.isSignatureLegal(op.getType()); - }); - - patterns->insert, - PassThroughConversion, ReturnOpConversion>( - context); - - // Here we use one specialized pattern for tf.Const with string tensors as - // it will incorrect to use ExecuteOp pattern to convert string tensor - // attribute. - patterns->insert(context, - &corert_converter); - - // TODO(b/148823030): Pattern registration for TF operations is not - // sustainable currently. We need to figure out a plan - patterns->insert, - // TODO(chky): Move the ReadVariableOp + Identity pattern - // to optimizer. 
- // CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion>(context, - &corert_converter); - - mlir::populateFuncOpTypeConversionPattern(*patterns, context, - corert_converter); - return mlir::applyPartialConversion(*module, *target, *patterns); -} - -std::unique_ptr CreateTFToCoreRTConversionPass() { - return std::make_unique(); -} - -void CreateTFExecutorToTFPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - // First, we prune unused operations in MLIR in TF Executor dialect. - pm.addPass(mlir::tf_executor::CreateTFExecutorGraphPruningPass()); - - // Then we pass the MLIR module through the TF standard pipeline, which for - // instances does shape inference, canonicalization, inlining, etc. - mlir::TF::StandardPipelineOptions tf_options; - tf_options.enable_inliner = true; - mlir::TF::CreateTFStandardPipeline(pm, tf_options); - - // After all standard passes run layout optimization to assign optimal data - // format for all layout sensitive operations. - mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; - layout_optimization_options.force_data_format = - options.force_data_format.getValue(); - mlir::TF::CreateLayoutOptimizationPipeline(pm, layout_optimization_options); - - // Run canonicalization pipeline to remove unused constants and bypassed - // transpose operations left in the IR after layout optimization. - pm.addNestedPass(mlir::createCanonicalizerPass()); - - if (options.default_device == "gpu") - pm.addNestedPass(mlir::TF::CreateGpuOpFusionPass()); - - // Then we assign default devices. - pm.addNestedPass( - mlir::TF::CreateSimpleTFDeviceAssignmentPass(options.default_device)); -} - -void CreateTFExecutorToCoreRTPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - CreateTFExecutorToTFPipeline(pm, options); - - // Convert it to MLIR in CoreRT dialect. - pm.addPass(CreateTFToCoreRTConversionPass()); - - // Run optimizer on the MLIR module in CoreRT dialect. 
- if (options.enable_optimizer) - pm.addNestedPass(CreateCoreRTOptimizePass()); -} - -static mlir::PassRegistration pass( - "tf-to-corert", - "Convert Tensorflow dialect to TFRT's CoreRuntime dialect."); - -static mlir::PassPipelineRegistration pipeline( - "tf-executor-to-corert-pipeline", - "Convert Tensorflow Executor dialect to TFRT's CoreRuntime dialect, and " - "also apply necessary optimization passes.", - CreateTFExecutorToCoreRTPipeline); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD new file mode 100644 index 00000000000..27a8dbd2809 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -0,0 +1,50 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +licenses(["notice"]) + +cc_library( + name = "cubin_creator", + srcs = ["cubin_creator.cc"], + hdrs = ["cubin_creator.h"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:Transforms", + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_materialize_broadcasts", # buildcleaner: keep + "//tensorflow/compiler/mlir/xla:xla_unfuse_batch_norm", # buildcleaner: keep + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service/gpu:stream_executor_util", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/compiler/xla/service/mlir_gpu:kernel_lowering", + "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core:lib", + ] + if_cuda(["//tensorflow/stream_executor/gpu:asm_compiler"]), +) + +tf_cc_binary( + name = "tf_to_cubin", + srcs = ["tf_to_cubin.cc"], + visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], + deps = [ + ":cubin_creator", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc new file mode 100644 index 00000000000..f47485d0214 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -0,0 +1,270 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +//===- cubin_creator.cc -----------------------------------------*- C++ -*-===// +// +// This file implements the function to compile a TF kernel function to a cubin. +// +//===----------------------------------------------------------------------===// +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/escaping.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#endif + +namespace { +using tensorflow::Status; +using xla::InternalError; +using xla::StatusOr; + +StatusOr GetLibdeviceDir( + const xla::HloModuleConfig& hlo_module_config) { + for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( + hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { + std::string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + return InternalError( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); +} + +struct MaterializeBroadcastsPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::ConversionTarget conversionTarget(getContext()); + mlir::OwningRewritePatternList conversionPatterns; + + // Consider the xla_hlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
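+ // (It is therefore marked legal as well, so the partial conversion below
+ // should only rewrite the broadcast-related ops targeted by the
+ // materialize-broadcasts patterns.)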
+ conversionTarget.addLegalDialect(); + + mlir::xla_hlo::SetupMaterializeBroadcastsLegality(&getContext(), + &conversionTarget); + mlir::xla_hlo::PopulateMaterializeBroadcastsPatterns(&getContext(), + &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +struct UnfuseBatchNormPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::xla_hlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); + mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); + } +}; + +Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + pm.addNestedPass(mlir::xla_hlo::createLegalizeTFPass(false)); + pm.addNestedPass( + absl::make_unique()); + pm.addNestedPass(absl::make_unique()); + pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass()); + pm.addNestedPass(mlir::xla_lhlo::createLhloCopyRemovalPass()); + + if (failed(pm.run(module))) { + return InternalError("Lowering TF to LHLO failed."); + } + return Status::OK(); +} + +struct PropagateStaticKnowledge + : public mlir::PassWrapper> { + explicit PropagateStaticKnowledge(mlir::FunctionType type, + llvm::ArrayRef same_shape_) + : func_type(type), same_shape(same_shape_) {} + + void runOnOperation() override { + // We know due to tensorflow ABI that the offset is always 0 and that the + // innermost stride is always 1. To make this visible to the compiler, + // we insert constants into the code and replace usages accordingly. + // We do not change the signature so that we keep a somewhat stable ABI + // that is easy to understand by tools. + mlir::LLVM::LLVMFuncOp func = getOperation(); + mlir::OpBuilder b(func.getBody()); + auto index_type = func.getArgument(3).getType(); + mlir::Value one = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); + mlir::Value zero = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); + uint32_t arg_pos = 0; + std::vector positions; + for (mlir::Type arg_type : func_type.getInputs()) { + positions.push_back(arg_pos); + func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); + arg_pos += 3 + arg_type.cast().getRank() * 2; + func.getArgument(arg_pos - 1).replaceAllUsesWith(one); + } + + // If we have knowledge that some arguments have the same shape, we + // can use that here. Simply replace usages of the shape parameters within + // the function body with a single shape parameter. + if (!same_shape.empty()) { + auto first = same_shape.front(); + auto first_offset = positions.at(first); + mlir::ShapedType first_type = + func_type.getInput(first).cast(); + uint32_t rank = first_type.getRank(); + for (auto same : same_shape.drop_front(1)) { + uint32_t same_offset = positions.at(same); + auto same_type = func_type.getInput(same).cast(); + if (same_type.getRank() != rank) { + func.emitOpError() << "same shape constraints on arguments with " + "non-matching shapes: #" + << first << " and #" << same; + signalPassFailure(); + } + + for (uint32_t i = 0; i < 2 * rank; ++i) { + // Replace uses for second arg data with first arg.
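+ // (Assuming the standard memref-to-LLVM descriptor expansion, each memref
+ // argument occupies 3 + 2 * rank positions: allocated pointer, aligned
+ // pointer, offset, then rank sizes followed by rank strides, so an
+ // argument's shape data starts at its recorded position + 3.)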
+ auto same_arg = func.getArgument(same_offset + 3 + i); + auto first_arg = func.getArgument(first_offset + 3 + i); + same_arg.replaceAllUsesWith(first_arg); + } + } + } + } + + mlir::FunctionType func_type; + llvm::ArrayRef same_shape; +}; + +Status PropagateStaticShapeKnowledgeToKernel( + mlir::ModuleOp module, llvm::ArrayRef same_shape) { + // Grab the original signature from the single function. + auto func = *module.getBody()->op_begin(); + + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); + kernel_pm.addNestedPass( + absl::make_unique(func.getType(), same_shape)); + + if (failed(pm.run(module))) { + return InternalError("Static knowledge propagation failed."); + } + return Status::OK(); +} +} // namespace + +StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( + llvm::StringRef tf_code, std::pair compute_capability, + llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + mlir::MLIRContext context; + context.allowUnregisteredDialects(); // TODO(b/152572127) + mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); + + TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); + TF_RETURN_IF_ERROR( + xla::mlir_gpu::LowerLHLOToGPU(module.get(), tile_sizes, unroll_factors, + /*collapseParallelLoops=*/false)); + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); + // TODO(b/156985522): Figure out why we get a segfault when generating Tanh + // with 'same_shape' containing {0, 1}. We would also get the crash if we + // unconditionally call PropagateStaticShapeKnowledgeToKernel while + // 'same_shape' is empty. + if (!same_shape.empty()) { + TF_RETURN_IF_ERROR( + PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); + } + + mlir::OwningModuleRef kernel_module = + xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to NVVM"); + } + + llvmModule->setModuleIdentifier("acme"); + llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); + TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( + llvmModule.get(), compute_capability, + config, libdevice_dir)); + VLOG(1) << ptx; + +#if GOOGLE_CUDA + return tensorflow::se::CompileGpuAsm( + std::get<0>(compute_capability), std::get<1>(compute_capability), + ptx.c_str(), xla::gpu::PtxOptsFromConfig(config)); +#else + return InternalError( + "GOOGLE_CUDA not defined. Did you specify --config=cuda ?"); +#endif +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h new file mode 100644 index 00000000000..47626ba9d0d --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- cubin_creator.h ------------------------------------------*- C++ -*-===// +// +// This file declares the function to compile a TF kernel function to a cubin. +// +//===----------------------------------------------------------------------===// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +xla::StatusOr> GenerateCubinForTfCode( + llvm::StringRef tf_code, + std::pair compute_capability = {7, 5}, + llvm::ArrayRef tile_sizes = {16, 64}, + llvm::ArrayRef same_shape = {}, + llvm::ArrayRef unroll_factors = {}); +} // namespace kernel_gen +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc new file mode 100644 index 00000000000..8edc567e777 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -0,0 +1,118 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//===- tf_to_cubin.cc -------------------------------------------*- C++ -*-===// +// +// This file implements the entry point to compile a tf op to a cubin file. 
+// +//===----------------------------------------------------------------------===// +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace { +bool ParseStringList(std::string string_list, std::vector* result) { + result->clear(); + uint32_t item; + auto items = absl::StrSplit(string_list, ','); + for (const auto& item_str : items) { + if (!absl::SimpleAtoi(item_str, &item)) { + LOG(ERROR) << "Expected token " << item_str << " to be an integer"; + return false; + } + result->push_back(item); + } + return true; +} +} // namespace + +int main(int argc, char** argv) { + std::string output_file = "foo.bin"; + int32_t architecture = 50; + std::vector tile_sizes; + std::vector unroll_factors; + std::vector same_shape; + + auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { + if (!ParseStringList(tile_sizes_str, &tile_sizes)) { + return false; + } + // Initialize with the default. + if (tile_sizes.empty()) { + tile_sizes.push_back(16); + tile_sizes.push_back(64); + } + return true; + }; + + auto parse_unroll_factors = + [&unroll_factors](std::string unroll_factors_str) { + return ParseStringList(unroll_factors_str, &unroll_factors); + }; + + auto parse_same_shape = [&same_shape](std::string same_shape_str) { + return ParseStringList(same_shape_str, &same_shape); + }; + + std::vector flag_list = { + tensorflow::Flag("output", &output_file, "output file"), + tensorflow::Flag("arch", &architecture, + "target architecture (e.g. 
50 for sm_50)"), + tensorflow::Flag("tile_sizes", parse_tile_sizes, "16,64", + "tile sizes to use"), + tensorflow::Flag("unroll_factors", parse_unroll_factors, "", + "factors to unroll by, separated by commas"), + tensorflow::Flag("same_shape", parse_same_shape, "", + "arguments with same shape, separated by commas"), + }; + bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain("usage", &argc, &argv); + if (!parse_ok) { + return 1; + } + + std::pair compute_capability(architecture / 10, + architecture % 10); + + auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( + argv[1], compute_capability, tile_sizes, same_shape, unroll_factors); + + if (!cubin.ok()) { + LOG(ERROR) << cubin.status(); + return 1; + } + + std::vector cubin_data = cubin.ConsumeValueOrDie(); + + auto status = tensorflow::WriteStringToFile( + tensorflow::Env::Default(), output_file, + absl::string_view{reinterpret_cast(cubin_data.data()), + cubin_data.size()}); + + if (!status.ok()) { + LOG(ERROR) << status; + return 1; + } + + return 0; +} diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index e4309d5eef0..736651b5022 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -23,7 +23,6 @@ package_group( "//tensorflow/compiler/xla/...", "//third_party/iree/...", "//third_party/mlir_edge/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) @@ -39,7 +38,8 @@ filegroup( "ir/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/ViewLikeInterface.td", ], ) @@ -51,9 +51,7 @@ gentbl( ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/chlo_ops.td", - td_srcs = [ - ":hlo_ops_td_files", - ], + td_srcs = [":hlo_ops_td_files"], ) gentbl( @@ -113,16 +111,11 @@ gentbl( gentbl( name = "xla_canonicalize_inc_gen", tbl_outs = [ - ( - "-gen-rewriters", - "transforms/generated_canonicalize.inc", - ), + ("-gen-rewriters", "transforms/generated_canonicalize.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/canonicalize.td", - td_srcs = [ - ":hlo_ops_td_files", - ], + td_srcs = [":hlo_ops_td_files"], ) cc_library( @@ -133,6 +126,7 @@ cc_library( "transforms/legalize_tf_control_flow.cc", ], deps = [ + ":chlo_legalize_to_hlo", ":convert_op_folder", ":hlo", "//tensorflow/compiler/mlir/tensorflow", @@ -157,9 +151,7 @@ cc_library( cc_library( name = "xla_legalize_tf_with_tf2xla", - srcs = [ - "transforms/legalize_tf_with_tf2xla.cc", - ], + srcs = ["transforms/legalize_tf_with_tf2xla.cc"], deps = [ ":hlo", ":mlir_hlo_builder", @@ -193,6 +185,22 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_sink_constants_to_control_flow", + srcs = ["transforms/sink_constants_to_control_flow.cc"], + deps = [ + ":hlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "map_xla_to_scalar_op", hdrs = ["transforms/map_xla_to_scalar_op.h"], @@ -241,14 +249,29 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", 
"@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], alwayslink = 1, ) +cc_library( + name = "lhlo_legalize_to_llvm", + srcs = ["transforms/lhlo_legalize_to_llvm.cc"], + deps = [ + ":lhlo", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "xla_legalize_to_linalg", srcs = ["transforms/xla_legalize_to_linalg.cc"], @@ -267,6 +290,21 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_transform_unranked_hlo", + srcs = ["transforms/xla_transform_unranked_hlo.cc"], + deps = [ + ":hlo", + "@com_google_absl//absl/memory", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "lhlo_legalize_to_gpu", srcs = ["transforms/lhlo_legalize_to_gpu.cc"], @@ -279,8 +317,8 @@ cc_library( "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -331,42 +369,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "buffer_assignment", - srcs = ["transforms/buffer_assignment.cc"], - hdrs = ["transforms/buffer_assignment.h"], - deps = [ - "@com_google_absl//absl/memory", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - -cc_library( - name = "buffer_assignment_test", - srcs = ["transforms/buffer_assignment_test.cc"], - hdrs = [ - "transforms/buffer_assignment.h", - "transforms/passes.h", - ], - deps = [ - "@com_google_absl//absl/memory", - "@llvm-project//llvm:support", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - gentbl( name = "xla_legalize_to_standard_inc_gen", tbl_outs = [ @@ -382,9 +384,7 @@ gentbl( cc_library( name = "xla_legalize_control_flow", - srcs = [ - "transforms/legalize_control_flow.cc", - ], + srcs = ["transforms/legalize_control_flow.cc"], deps = [ ":hlo", "@llvm-project//llvm:support", @@ -471,9 +471,7 @@ cc_library( cc_library( name = "xla_materialize_broadcasts", - srcs = [ - "transforms/materialize_broadcasts.cc", - ], + srcs = ["transforms/materialize_broadcasts.cc"], deps = [ ":hlo", "@llvm-project//mlir:IR", @@ -484,9 +482,7 @@ cc_library( cc_library( name = "xla_unfuse_batch_norm", - srcs = [ - "transforms/unfuse_batch_norm.cc", - ], + srcs = ["transforms/unfuse_batch_norm.cc"], deps = [ ":hlo", "@llvm-project//llvm:support", @@ -498,9 +494,7 @@ cc_library( cc_library( name = "chlo_legalize_to_hlo", - srcs = [ - "transforms/chlo_legalize_to_hlo.cc", - ], + srcs = ["transforms/chlo_legalize_to_hlo.cc"], deps = [ ":hlo", "@llvm-project//mlir:IR", @@ -513,6 +507,7 @@ cc_library( name = "xla_test_passes", srcs = [ "transforms/chlo_legalize_to_hlo_pass.cc", + "transforms/lhlo_legalize_to_llvm_pass.cc", "transforms/materialize_broadcasts_pass.cc", "transforms/test_infer_shaped_type_pass.cc", 
"transforms/unfuse_batch_norm_pass.cc", @@ -520,10 +515,14 @@ cc_library( deps = [ ":chlo_legalize_to_hlo", # build-cleaner: keep ":hlo", + ":lhlo", + ":lhlo_legalize_to_llvm", # build-cleaner: keep ":xla_materialize_broadcasts", # build-cleaner: keep ":xla_unfuse_batch_norm", # build-cleaner: keep "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", @@ -575,12 +574,8 @@ cc_library( cc_library( name = "mlir_hlo_builder", - srcs = [ - "ir/mlir_hlo_builder.cc", - ], - hdrs = [ - "ir/mlir_hlo_builder.h", - ], + srcs = ["ir/mlir_hlo_builder.cc"], + hdrs = ["ir/mlir_hlo_builder.h"], deps = [ ":attribute_importer", ":hlo", @@ -626,6 +621,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:ViewLikeInterface", ], alwayslink = 1, ) @@ -823,7 +819,7 @@ genrule( name = "operator_writer_inc", srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ":ir/hlo_ops.td", ":ir/hlo_ops_base.td", @@ -853,8 +849,6 @@ cc_library( "//tensorflow/compiler/mlir:__subpackages__", ], deps = [ - ":buffer_assignment", - ":buffer_assignment_test", ":chlo_legalize_to_hlo", ":hlo", ":hlo_legalize_to_lhlo", @@ -872,8 +866,9 @@ cc_library( ":xla_legalize_to_linalg", ":xla_legalize_to_standard", ":xla_lower", - ":xla_materialize_broadcasts", + ":xla_sink_constants_to_control_flow", ":xla_test_passes", + ":xla_transform_unranked_hlo", ], ) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 5dc610a5670..22a0b038833 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -420,15 +420,37 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kConditional: { llvm::SmallVector rets; - TF_RETURN_IF_ERROR(GetMlirTypes( - {instruction->true_computation()->root_instruction()}, &rets)); + mlir::Type pred_or_index_type = + operands[0].getType().cast().getElementType(); + // It is a predicated conditional if first argument is a boolean and + // should be mapped to If op. + if (pred_or_index_type.isInteger(1)) { + TF_RETURN_IF_ERROR(GetMlirTypes( + {instruction->true_computation()->root_instruction()}, &rets)); - auto op = func_builder->create( - loc, rets, operands, attributes); - TF_RETURN_IF_ERROR(ImportComputation(instruction->true_computation(), - &op.true_branch())); - TF_RETURN_IF_ERROR(ImportComputation(instruction->false_computation(), - &op.false_branch())); + auto op = func_builder->create(loc, rets, operands, + attributes); + TF_RETURN_IF_ERROR(ImportComputation(instruction->true_computation(), + &op.true_branch())); + TF_RETURN_IF_ERROR(ImportComputation(instruction->false_computation(), + &op.false_branch())); + return op.getOperation(); + } + + // Otherwise, it is a indexed conditional and should be mapped to Case op. 
+ TF_RETURN_IF_ERROR(GetMlirTypes( + {instruction->branch_computation(0)->root_instruction()}, &rets)); + + int num_branches = instruction->branch_count(); + auto op = func_builder->create( + loc, rets, operands, attributes, num_branches); + for (auto index_and_computation : + llvm::enumerate(instruction->branch_computations())) { + auto index = index_and_computation.index(); + HloComputation* computation = index_and_computation.value(); + TF_RETURN_IF_ERROR( + ImportComputation(computation, &op.branches()[index])); + } return op.getOperation(); } case HloOpcode::kConcatenate: { diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index c685cc296fd..dc801f64ede 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -139,6 +139,10 @@ StatusOr CreateDenseElementsAttrFromLiteral( return CreateDenseAttrFromLiteral(type, literal); case PrimitiveType::U64: return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C64: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C128: + return CreateDenseAttrFromLiteral(type, literal); default: return tensorflow::errors::Internal( absl::StrCat("Unsupported type: ", PrimitiveType_Name(element_type))); diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index bc6842a617e..26db4549a2a 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -97,16 +97,12 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, LogicalResult InferBroadcastBinaryOpReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, Type element_type, + DictionaryAttr attributes, Type element_type, SmallVectorImpl& inferedReturnShapes) { // Find broadcast_dimensions. - DenseIntElementsAttr broadcast_dimensions; - for (auto attr : attributes) { - if (attr.first == "broadcast_dimensions") { - broadcast_dimensions = attr.second.dyn_cast(); - break; - } - } + DenseIntElementsAttr broadcast_dimensions = + attributes.get("broadcast_dimensions") + .dyn_cast_or_null(); ShapedType lhs_type = operands[0].getType().dyn_cast(); ShapedType rhs_type = operands[1].getType().dyn_cast(); @@ -168,7 +164,7 @@ LogicalResult ReifyBroadcastBinaryOpReturnTypeShapes( LogicalResult BroadcastComplexOp::inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { ShapedType lhs_type = operands[0].getType().dyn_cast(); if (!lhs_type) { @@ -189,9 +185,19 @@ LogicalResult BroadcastComplexOp::reifyReturnTypeShapes( // BroadcastCompareOp (has custom type inference due to different result type). 
//===----------------------------------------------------------------------===// +void BroadcastCompareOp::build(OpBuilder& builder, OperationState& result, + Value lhs, Value rhs, + DenseIntElementsAttr broadcast_dimensions, + StringAttr comparison_direction) { + auto new_type = GetBroadcastType(lhs.getType(), rhs.getType(), + builder.getI1Type(), broadcast_dimensions); + build(builder, result, new_type, lhs, rhs, broadcast_dimensions, + comparison_direction); +} + LogicalResult BroadcastCompareOp::inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { Type element_type = IntegerType::get(1, context); return InferBroadcastBinaryOpReturnTypeComponents(context, location, operands, @@ -211,7 +217,7 @@ LogicalResult BroadcastCompareOp::reifyReturnTypeShapes( #define BROADCAST_INFER_SHAPE_TYPE_OP_DEFS(Op) \ LogicalResult Op::inferReturnTypeComponents( \ MLIRContext* context, Optional location, ValueRange operands, \ - ArrayRef attributes, RegionRange regions, \ + DictionaryAttr attributes, RegionRange regions, \ SmallVectorImpl& inferedReturnShapes) { \ return InferBroadcastBinaryOpReturnTypeComponents( \ context, location, operands, attributes, /*element_type=*/nullptr, \ diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h index 474d4b7d95a..a5337907579 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h @@ -25,7 +25,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace xla_chlo { diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td index a244985c9b5..febc99f6b72 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td @@ -31,7 +31,7 @@ limitations under the License. 
include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def HLOClient_Dialect : Dialect { @@ -360,6 +360,11 @@ def HLOClient_BroadcastCompareOp : HLOClient_BroadcastBinaryElementwiseOp< HLO_ComparisonDirectionAttr:$comparison_direction ); let results = (outs HLO_PredTensor); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " + "DenseIntElementsAttr broadcast_dimensions, StringAttr comparison_direction" + >]; } #endif // CHLO_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index cb7372a762c..c66b8f12332 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -262,6 +262,34 @@ static LogicalResult Verify(IotaOp op) { return success(); } +//===----------------------------------------------------------------------===// +// DynamicIotaOp +//===----------------------------------------------------------------------===// + +namespace { + +struct DynamicIotaIsStatic : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(DynamicIotaOp iota, + PatternRewriter& rewriter) const override { + auto result_ty = iota.getType().cast(); + if (!result_ty.hasStaticShape()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(iota, result_ty, iota.iota_dimension()); + return success(); + } +}; + +} // namespace + +void DynamicIotaOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // AbsOp //===----------------------------------------------------------------------===// @@ -567,48 +595,6 @@ OpFoldResult BroadcastInDimOp::fold(ArrayRef) { return getOperand(); } -//===----------------------------------------------------------------------===// -// ScalarsToDimensionTensorOp -//===----------------------------------------------------------------------===// - -namespace { - -// Canonicalizes the pattern of the form -// -// %2 = "xla_hlo.scalars_to_dimension_tensor"(%0, %1) -// : (i32, i32) -> tensor<2xi32> -// %3 = extract_element %2[%c0] : tensor<2xi32> -// -// to just %0. 
-struct ExtractElementFromScalarsToDimensionTensor - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractElementOp extract, - PatternRewriter& rewriter) const override { - if (extract.indices().size() != 1) return failure(); - - if (auto scalars_to_tensor = dyn_cast_or_null( - extract.aggregate().getDefiningOp())) { - APInt index; - if (!matchPattern(*extract.indices().begin(), m_ConstantInt(&index))) { - return failure(); - } - rewriter.replaceOp(extract, - scalars_to_tensor.getOperand(index.getZExtValue())); - return success(); - } - return failure(); - } -}; - -} // namespace - -void ScalarsToDimensionTensorOp::getCanonicalizationPatterns( - OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); -} - //===----------------------------------------------------------------------===// // DynamicBroadcastInDimOp //===----------------------------------------------------------------------===// @@ -1170,9 +1156,22 @@ OpFoldResult CopyOp::fold(ArrayRef operands) { return getOperand(); } //===----------------------------------------------------------------------===// OpFoldResult ReverseOp::fold(ArrayRef operands) { + auto input = operand(); + // No dimensions to reverse. - if (dimensions().getNumElements() == 0) return operand(); - return nullptr; + if (dimensions().getNumElements() == 0) return input; + + llvm::SmallVector new_dims; + new_dims.reserve(dimensions().getNumElements()); + + auto shaped_type = input.getType().cast(); + for (auto dim : dimensions().getValues()) { + if (shaped_type.getDimSize(dim.getLimitedValue()) != 1) { + return nullptr; + } + } + + return input; } //===----------------------------------------------------------------------===// @@ -1240,7 +1239,7 @@ static LogicalResult Verify(SelectOp op) { // the return type based on operand type. LogicalResult SelectOp::inferReturnTypes( MLIRContext*, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { auto x_type = operands[1].getType(); auto y_type = operands[2].getType(); @@ -1345,19 +1344,23 @@ static LogicalResult Verify(PadOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(ReshapeOp op) { - auto operand_ty = op.operand().getType().cast(); + // If the operand type is dynamically shaped there is nothing to verify. + auto operand_ty = op.operand().getType().cast(); if (!operand_ty || !operand_ty.hasStaticShape()) return success(); - int64_t num_input_elements = operand_ty.getNumElements(); - auto out_ty = op.getType().cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_input_elements != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") doesn't match expected number of elements (" - << num_input_elements << ")"; - } + // If the operand type is statically shaped (not required) the number of + // elements must match that of the result type. 
+ auto result_ty = op.getType().cast(); + assert(result_ty && result_ty.hasStaticShape() && + "result type must be statically shaped"); + int64_t num_result_elements = result_ty.getNumElements(); + int64_t num_operand_elements = operand_ty.getNumElements(); + if (num_result_elements != num_operand_elements) + return op.emitOpError() + << "number of output elements (" << num_result_elements + << ") doesn't match expected number of elements (" + << num_operand_elements << ")"; + return success(); } @@ -1379,94 +1382,71 @@ OpFoldResult ReshapeOp::fold(ArrayRef operands) { return {}; } +//===----------------------------------------------------------------------===// +// Case Op +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(CaseOp op) { + auto num_branches = op.branches().size(); + if (op.branch_operands().size() != num_branches) + return op.emitOpError() << "expects number of branches " << num_branches + << " to be same as number of branch operands " + << op.branch_operands().size(); + + MutableArrayRef branches = op.branches(); + OperandRange branch_operands = op.branch_operands(); + for (unsigned i = 0; i < num_branches; ++i) { + mlir::Region& branch_region = branches[i]; + if (branch_region.empty()) + return op.emitOpError() << "cannot have empty regions"; + mlir::Block& entry_block = branch_region.front(); + if (entry_block.getNumArguments() != 1) + return op.emitOpError() + << "expects branch regions to have single argument, but found " + << entry_block.getNumArguments() << " for branch " << i; + auto operand = branch_operands[i]; + if (entry_block.getArgument(0).getType() != operand.getType()) + return op.emitOpError() + << "expects operand " << i + 1 << " to be of type " + << entry_block.getArgument(0).getType() << ", but found " + << operand.getType(); + WalkResult walker = branch_region.walk([&](ReturnOp return_op) { + if (return_op.getOperands().getTypes() != op.getResultTypes()) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + if (walker.wasInterrupted()) + return op.emitOpError() + << "branch " << i + << " returned values do not match op result types"; + } + return success(); +} + //===----------------------------------------------------------------------===// // BinaryOps //===----------------------------------------------------------------------===// namespace { -// Gets the resulting type from a broadcast between two types. -static Type GetBroadcastType(Builder* builder, Type x, Type y, - Type element_type, - DenseIntElementsAttr broadcast_dimensions) { + +// Updates the element type of a (presumed) tensor type 'x', returning either +// a permuted UnrankedTensorType or RankedTensorType. +static Type UpdateResultElementType(Builder* builder, Type x, + Type element_type) { auto x_ranked = x.dyn_cast(); - auto y_ranked = y.dyn_cast(); - if (!x_ranked || !y_ranked) { + if (!x_ranked) { return UnrankedTensorType::get(element_type); } auto shape_x = x_ranked.getShape(); - auto shape_y = y_ranked.getShape(); - - if (shape_x.size() == shape_y.size()) { - llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0; i < shape_x.size(); i++) { - auto x_val = shape_x[i]; - auto y_val = shape_y[i]; - if (x_val == -1 || y_val == -1) { - out_shape[i] = -1; - } else { - out_shape[i] = std::max(x_val, y_val); - } - } - return RankedTensorType::get(out_shape, element_type); - } - - // Return unranked tensor for invalid broadcast dimensions. 
- if (!broadcast_dimensions) return UnrankedTensorType::get(element_type); - - auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; - auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; - - llvm::SmallVector out_shape(shape_large.begin(), - shape_large.end()); - - // Update according to the broadcast dimensions. - for (auto index_pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { - auto old_value = out_shape[index_pair.value().getSExtValue()]; - auto new_value = shape_small[index_pair.index()]; - if (old_value != -1 && (new_value == -1 || new_value > old_value)) { - out_shape[index_pair.value().getSExtValue()] = new_value; - } - } - - return RankedTensorType::get(out_shape, element_type); + return RankedTensorType::get(shape_x, element_type); } } // namespace -#define BINARY_BUILDER(Op) \ - void Op::build(OpBuilder& builder, OperationState& result, Value left, \ - Value right, DenseIntElementsAttr broadcast_dimensions) { \ - auto type = GetBroadcastType(&builder, left.getType().cast(), \ - right.getType().cast(), \ - getElementTypeOrSelf(right.getType()), \ - broadcast_dimensions); \ - return Op::build(builder, result, type, left, right, \ - broadcast_dimensions); \ - } - -BINARY_BUILDER(AddOp); -BINARY_BUILDER(AndOp); -BINARY_BUILDER(Atan2Op); -BINARY_BUILDER(DivOp); -BINARY_BUILDER(MaxOp); -BINARY_BUILDER(MinOp); -BINARY_BUILDER(MulOp); -BINARY_BUILDER(OrOp); -BINARY_BUILDER(PowOp); -BINARY_BUILDER(RemOp); -BINARY_BUILDER(ShiftLeftOp); -BINARY_BUILDER(ShiftRightArithmeticOp); -BINARY_BUILDER(ShiftRightLogicalOp); -BINARY_BUILDER(SubOp); -BINARY_BUILDER(XorOp); - -#undef BINARY_BUILDER - template static Attribute BinaryFolder(Op* op, ArrayRef attrs) { if (!attrs[0] || !attrs[1]) return {}; - if (op->broadcast_dimensions().hasValue()) return {}; DenseElementsAttr lhs = attrs[0].dyn_cast(); DenseElementsAttr rhs = attrs[1].dyn_cast(); @@ -1494,6 +1474,38 @@ static Attribute BinaryFolder(Op* op, ArrayRef attrs) { return DenseElementsAttr::get(type, values); } +template +struct divide : std::divides {}; + +template <> +struct divide { + APInt operator()(const APInt& a, const APInt& b) const { return a.sdiv(b); } +}; + +template +struct max { + T operator()(const T& a, const T& b) const { return std::max(a, b); } +}; + +template <> +struct max { + APInt operator()(const APInt& a, const APInt& b) const { + return llvm::APIntOps::smax(a, b); + } +}; + +template +struct min { + T operator()(const T& a, const T& b) const { return std::min(a, b); } +}; + +template <> +struct min { + APInt operator()(const APInt& a, const APInt& b) const { + return llvm::APIntOps::smin(a, b); + } +}; + #define BINARY_FOLDER(Op, Func) \ OpFoldResult Op::fold(ArrayRef attrs) { \ if (getElementTypeOrSelf(getType()).isa()) \ @@ -1503,9 +1515,16 @@ static Attribute BinaryFolder(Op* op, ArrayRef attrs) { return {}; \ } +// Addition, subtraction and multiplication use the std:: versions of the ops. +// Due to the other ops behaving differently in signed vs unsigned integers, +// APInts need a special implementation. Currently, it replicates signed int +// op behavior. 
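// Illustrative standalone sketch (not from this patch): the divide/max/min
// specializations above are needed because llvm::APInt stores raw bits with
// no signedness, so the caller must pick signed or unsigned arithmetic
// explicitly. The folders use the signed flavor, e.g.:

#include <cstdint>

#include "llvm/ADT/APInt.h"

static void ApIntSignedFoldingSketch() {
  // The 8-bit pattern 0xFA reads as -6 signed and 250 unsigned.
  llvm::APInt a(/*numBits=*/8, static_cast<uint64_t>(-6), /*isSigned=*/true);
  llvm::APInt b(/*numBits=*/8, 3);

  int64_t signed_div = a.sdiv(b).getSExtValue();     // -2, what divide<APInt> computes
  uint64_t unsigned_div = a.udiv(b).getZExtValue();  // 83 (250 / 3), the unsigned alternative
  int64_t signed_max =
      llvm::APIntOps::smax(a, b).getSExtValue();     // 3, what max<APInt> computes
  (void)signed_div;
  (void)unsigned_div;
  (void)signed_max;
}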
BINARY_FOLDER(AddOp, std::plus); BINARY_FOLDER(SubOp, std::minus); BINARY_FOLDER(MulOp, std::multiplies); +BINARY_FOLDER(DivOp, divide); +BINARY_FOLDER(MaxOp, max); +BINARY_FOLDER(MinOp, min); #undef BINARY_FOLDER @@ -1876,12 +1895,10 @@ void UnaryEinsumOp::getCanonicalizationPatterns( //===----------------------------------------------------------------------===// void CompareOp::build(OpBuilder& builder, OperationState& result, Value lhs, - Value rhs, DenseIntElementsAttr broadcast_dimensions, - StringAttr comparison_direction) { - auto new_type = GetBroadcastType(&builder, lhs.getType(), rhs.getType(), - builder.getI1Type(), broadcast_dimensions); - build(builder, result, new_type, lhs, rhs, broadcast_dimensions, - comparison_direction); + Value rhs, StringAttr comparison_direction) { + auto new_type = + UpdateResultElementType(&builder, lhs.getType(), builder.getI1Type()); + build(builder, result, new_type, lhs, rhs, comparison_direction); } #define GET_OP_CLASSES @@ -1969,11 +1986,8 @@ LogicalResult deriveShapeFromFirstOperand( loc, builder->getI64IntegerAttr(element.value()))); } } - *reifiedReturnShapes = - SmallVector{builder->create( - loc, - RankedTensorType::get({operand_type.getRank()}, shape_scalar_type), - shape_values)}; + *reifiedReturnShapes = SmallVector{ + builder->create(loc, shape_values)}; return success(); } diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 25b2f009cc6..9725a0684f6 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -29,7 +29,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index dabf03d3c9f..97b8e1c1863 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -23,7 +23,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" @@ -79,6 +79,25 @@ def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let hasCustomHLOConverter = 1; } +def HLO_DynamicIotaOp: HLO_Op<"dynamic_iota", [NoSideEffect]> { + let summary = "Create linear increasing values from 0 to length -1."; + let description = [{ + Produces an HLO Tensor of the specified shape, with an incremental set of + values along the specified dimension starting at 0. + + Requires: + - The output length of the tensor result. + }]; + + let arguments = (ins HLO_DimensionTensor:$output_shape, I64Attr:$iota_dimension); + let results = (outs HLO_Tensor:$result); + + let hasCanonicalizer = 1; + // Cannot be exported to legacy formats. + let hasCustomHLOConverter = 1; +} + + def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { string summary = "Create Token operator"; @@ -95,6 +114,7 @@ def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { // XLA unary elementwise op definitions. 
//===----------------------------------------------------------------------===// // See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions + class HLO_UnaryElementwiseOp traits, Type TensorType>: HLO_Op { @@ -103,8 +123,7 @@ class HLO_UnaryElementwiseOp traits, let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, - ValueRange operands, ArrayRef attributes, - RegionRange regions, + ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } @@ -161,6 +180,16 @@ def HLO_Expm1Op: HLO_UnaryElementwiseOp<"exponential_minus_one", def HLO_FloorOp: HLO_UnaryElementwiseOp<"floor", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_FloorOp; +def HLO_ImagOp: HLO_Op< + "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_IsFiniteOp: HLO_UnaryElementwiseOp<"is_finite", [NoSideEffect, SameOperandsAndResultShape], HLO_Tensor>, BASE_HLO_IsFiniteOp { @@ -188,6 +217,16 @@ def HLO_PopulationCountOp: HLO_UnaryElementwiseOp<"popcnt", [NoSideEffect, SameOperandsAndResultType], HLO_IntTensor>, BASE_HLO_PopulationCountOp; +def HLO_RealOp: HLO_Op< + "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_RoundOp: HLO_UnaryElementwiseOp<"round_nearest_afz", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_RoundOp; @@ -209,67 +248,25 @@ def HLO_SqrtOp: HLO_UnaryElementwiseOp<"sqrt", BASE_HLO_SqrtOp; def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", - [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType], + [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, BASE_HLO_TanhOp; -//===----------------------------------------------------------------------===// -// XLA complex unary elementwise op definitions. -//===----------------------------------------------------------------------===// -// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions - -def HLO_ComplexOp: HLO_Op<"complex", - [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape]>, - BASE_HLO_ComplexOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; - - let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); - let results = (outs HLO_ComplexTensor); - let hasFolder = 1; -} - -def HLO_ImagOp: HLO_Op< - "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - -def HLO_RealOp: HLO_Op< - "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. 
//===----------------------------------------------------------------------===// - // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations + class HLO_BinaryElementwiseOp traits> : HLO_Op { let arguments = (ins HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - OptionalAttr:$broadcast_dimensions + HLO_Tensor:$rhs ); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions" - >]; - let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } @@ -286,47 +283,61 @@ class HLO_BinaryElementwiseOp traits> : } def HLO_AddOp : HLO_BinaryElementwiseOp<"add", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_AddOp { + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_AddOp { let hasFolder = 1; } def HLO_Atan2Op : HLO_BinaryElementwiseOp<"atan2", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_Atan2Op; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_Atan2Op; + +def HLO_ComplexOp: HLO_Op<"complex", + [NoSideEffect, SameOperandsAndResultShape]>, + BASE_HLO_ComplexOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; + + let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); + let results = (outs HLO_ComplexTensor); + let hasFolder = 1; +} def HLO_DivOp : HLO_BinaryElementwiseOp<"divide", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp { + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_DivOp { + let hasFolder = 1; } def HLO_MaxOp : HLO_BinaryElementwiseOp<"maximum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MaxOp { + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MaxOp { + let hasFolder = 1; } def HLO_MinOp : HLO_BinaryElementwiseOp<"minimum", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MinOp { + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MinOp { + let hasFolder = 1; } def HLO_MulOp : HLO_BinaryElementwiseOp<"multiply", - [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MulOp { + [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MulOp { let hasFolder = 1; } def HLO_PowOp : HLO_BinaryElementwiseOp<"power", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_PowOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_PowOp; def HLO_RemOp : HLO_BinaryElementwiseOp<"remainder", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_RemOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_RemOp; def HLO_ShiftLeftOp : HLO_BinaryElementwiseOp<"shift_left", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftLeftOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftLeftOp; def HLO_ShiftRightArithmeticOp : HLO_BinaryElementwiseOp<"shift_right_arithmetic", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightArithmeticOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftRightArithmeticOp; def HLO_ShiftRightLogicalOp : HLO_BinaryElementwiseOp<"shift_right_logical", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightLogicalOp; + [NoSideEffect, SameOperandsAndResultType]>, 
BASE_HLO_ShiftRightLogicalOp; def HLO_SubOp : HLO_BinaryElementwiseOp<"subtract", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_SubOp { + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_SubOp { let hasFolder = 1; } @@ -336,11 +347,11 @@ def HLO_SubOp : HLO_BinaryElementwiseOp<"subtract", // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations class HLO_BinaryLogicalElementwiseOp : - HLO_BinaryElementwiseOp { + HLO_BinaryElementwiseOp< + mnemonic, [Commutative, NoSideEffect, SameOperandsAndResultType]> { let arguments = (ins HLO_PredOrIntTensor:$lhs, - HLO_PredOrIntTensor:$rhs, - OptionalAttr:$broadcast_dimensions + HLO_PredOrIntTensor:$rhs ); } @@ -472,7 +483,7 @@ def HLO_ReplicaIdOp : HLO_Op<"replica_id", [NoSideEffect]>, // XLA control flow op definitions. //===----------------------------------------------------------------------===// -def HLO_AfterAllOp : HLO_Op<"after_all", []> { +def HLO_AfterAllOp : HLO_Op<"after_all", [NoSideEffect]> { string summary = "AfterAll operator"; @@ -489,8 +500,11 @@ def HLO_AfterAllOp : HLO_Op<"after_all", []> { let results = (outs HLO_Token); } -def HLO_ConditionalOp: HLO_Op<"conditional", []> { - string summary = "Conditional operator"; +// Xla Client API has two separate calls for indexed and predicated conditional, +// although both eventually map to kConditional HLO. IfOp maps to predicated +// conditional use of kConditional HLO. +def HLO_IfOp: HLO_Op<"if", [RecursiveSideEffects]> { + string summary = "If operator"; string description = [{ Returns the result of executing either a true or false function depending on @@ -505,7 +519,8 @@ def HLO_ConditionalOp: HLO_Op<"conditional", []> { HLO_TensorOrTuple:$false_arg ); - let regions = (region AnyRegion:$true_branch, AnyRegion:$false_branch); + let regions = (region AnyRegion:$true_branch, + AnyRegion:$false_branch); let results = (outs HLO_TensorOrTuple); @@ -513,7 +528,27 @@ def HLO_ConditionalOp: HLO_Op<"conditional", []> { let hasCustomHLOConverter = 1; } -def HLO_WhileOp: HLO_Op<"while", [SameOperandsAndResultType]> { +// Xla Client API has two separate calls for indexed and predicated conditional, +// although both eventually map to kConditional HLO. CaseOp maps to indexed +// conditional use of kConditional HLO. 
+def HLO_CaseOp: HLO_Op<"case", [RecursiveSideEffects]>, + BASE_HLO_CaseOp { + + let arguments = (ins + I32Tensor:$index, + Variadic:$branch_operands + ); + + let regions = (region VariadicRegion:$branches); + + let results = (outs Variadic); + + let hasCustomHLOConverter = 1; +} + + +def HLO_WhileOp: HLO_Op<"while", [RecursiveSideEffects, + SameOperandsAndResultType]> { string summary = "While operator"; string description = [{ @@ -534,7 +569,7 @@ def HLO_WhileOp: HLO_Op<"while", [SameOperandsAndResultType]> { } def HLO_AllReduceOp : HLO_Op<"all_reduce", - [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_AllReduceOp { + [SameOperandsAndResultType]>, BASE_HLO_AllReduceOp { let arguments = (ins HLO_Tensor:$operand, @@ -561,7 +596,7 @@ def HLO_AllToAllOp : HLO_Op<"all_to_all", } def HLO_ReduceOp: HLO_Op<"reduce", [ - NoSideEffect, + RecursiveSideEffects, SameVariadicOperandSize, SingleBlockImplicitTerminator<"ReturnOp"> ]>, BASE_HLO_ReduceOp { @@ -619,23 +654,18 @@ def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { } def HLO_CompareOp: HLO_Op<"compare", - [NoSideEffect, SameOperandsElementType]>, BASE_HLO_CompareOp { + [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]>, + BASE_HLO_CompareOp { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, - OptionalAttr:$broadcast_dimensions, HLO_ComparisonDirectionAttr:$comparison_direction ); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value left, Value right, " - "DenseIntElementsAttr broadcast_dimensions, " - "StringAttr comparison_direction" - >]; let results = (outs HLO_PredTensor); let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " - "DenseIntElementsAttr broadcast_dimensions, StringAttr comparison_direction" + "StringAttr comparison_direction" >]; } @@ -781,23 +811,6 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", let hasCustomHLOConverter = 1; } -def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", - [SameOperandsElementType, NoSideEffect]> { - string summary = "Converts a sequence of scalars into a 1d tensor."; - - string description = [{ - This is a useful operation that is currently missing in Standard. Used to - compute shape arguments to dynamic operations. - }]; - - let arguments = (ins Variadic:$scalars); - let results = (outs HLO_DimensionTensor); - - // Cannot be exported to legacy formats. 
- let hasCustomHLOConverter = 1; - let hasCanonicalizer = 1; -} - def HLO_DynamicBroadcastInDimOp : HLO_Op<"dynamic_broadcast_in_dim", [NoSideEffect]> { string summary = "Broadcast a tensor into the given dynamic shape by adding dimensions."; @@ -1047,8 +1060,8 @@ def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, } def HLO_MapOp: HLO_Op<"map", - [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape, - SingleBlockImplicitTerminator<"ReturnOp">]>, + [RecursiveSideEffects, SameOperandsElementType, + SameOperandsAndResultShape, SingleBlockImplicitTerminator<"ReturnOp">]>, BASE_HLO_MapOp { let arguments = (ins Variadic:$operands, @@ -1063,13 +1076,13 @@ def HLO_ReshapeOp: HLO_Op<"reshape", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReshapeOp { let arguments = (ins HLO_Tensor:$operand); - let results = (outs HLO_Tensor); + let results = (outs HLO_StaticShapeTensor); let hasFolder = 1; let hasCustomHLOConverter = 1; } -def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", []> { +def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", [NoSideEffect]> { let summary = "Reshape a tensor to a given, possibly dynamic, shape."; let description = [{ Reshapes `operand` to `output_shape`. @@ -1097,7 +1110,8 @@ def ScatterDimensionNumbers : StructAttr<"ScatterDimensionNumbers", HLO_Dialect, let description = "Structure of dimension information for scatter"; } -def HLO_ScatterOp: HLO_Op<"scatter", [NoSideEffect]>, BASE_HLO_ScatterOp { +def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, + BASE_HLO_ScatterOp { let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$scatter_indices, @@ -1126,7 +1140,7 @@ def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods, BASE_HLO_SelectAndScatterOp { + [RecursiveSideEffects]>, BASE_HLO_SelectAndScatterOp { let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$source, @@ -1153,7 +1167,7 @@ def HLO_SetDimensionSizeOp: HLO_Op<"set_dimension_size", [NoSideEffect]>, let results = (outs HLO_Tensor); } -def HLO_SortOp : HLO_Op<"sort", [NoSideEffect]>, BASE_HLO_SortOp { +def HLO_SortOp : HLO_Op<"sort", [RecursiveSideEffects]>, BASE_HLO_SortOp { let arguments = (ins Variadic:$operands, DefaultValuedAttr:$dimension, @@ -1205,7 +1219,7 @@ def HLO_PadOp: HLO_Op<"pad", let hasCustomHLOConverter = 1; } -def HLO_TraceOp: HLO_Op<"trace", [NoSideEffect]>, BASE_HLO_TraceOp { +def HLO_TraceOp: HLO_Op<"trace", []>, BASE_HLO_TraceOp { let arguments = (ins HLO_Tensor:$operand, StrAttr:$tag @@ -1239,7 +1253,7 @@ def HLO_TriangularSolveOp: HLO_Op<"triangular_solve", } def HLO_ReduceWindowOp: HLO_Op<"reduce_window", [ - NoSideEffect, + RecursiveSideEffects, SingleBlockImplicitTerminator<"ReturnOp"> ]>, BASE_HLO_ReduceWindowOp { @@ -1270,7 +1284,7 @@ def HLO_ReduceWindowOp: HLO_Op<"reduce_window", [ // TODO(hinsu): Implement custom printer and parser. } -def HLO_ReturnOp : HLO_Op<"return", [Terminator]> { +def HLO_ReturnOp : HLO_Op<"return", [NoSideEffect, Terminator]> { let summary = [{ The `hlo.return` operation terminates a region and returns values. 
}]; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index c087ffd1f40..bad1bf16ec3 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -62,9 +62,11 @@ def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; +def HLO_DimensionValue : AnyTypeOf<[Index, HLO_Pred, HLO_Int]>; + // Dynamic representation of a shape vector as a tensor. def HLO_DimensionTensor : ShapedContainerType< - [Index, HLO_Pred, HLO_Int], + [HLO_DimensionValue], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, "a 1D tensor of dimensions">; @@ -150,15 +152,6 @@ class BASE_HLO_ClzOp { }]; } -class BASE_HLO_ComplexOp { - string summary = "Complex operator"; - - string description = [{ - Performs element-wise conversion of a pair of real and imaginary values to - a complex value. - }]; -} - class BASE_HLO_ConvertOp { string summary = "Convert operator"; @@ -400,6 +393,15 @@ class BASE_HLO_AddOp { }]; } +class BASE_HLO_ComplexOp { + string summary = "Complex operator"; + + string description = [{ + Performs element-wise conversion of a pair of real and imaginary values to + a complex value. + }]; +} + class BASE_HLO_DivOp { string summary = "Division operator"; @@ -553,6 +555,29 @@ class BASE_HLO_XorOp { }]; } +//===----------------------------------------------------------------------===// +// XLA control flow related op definitions. +//===----------------------------------------------------------------------===// + +class BASE_HLO_CaseOp { + string summary = "Switch-Case operator"; + + string description = [{ + Returns the result of executing `branches[index]`. If + `index` is < 0 or >= N, then `branches[N-1] is executed as + the default branch. + + Each branch `branches[b]` must take in a single argument of same type as + `branch_operands[b]` and will be invoked with `branch_operands[b]`. The type + of the returned value of each branch must be the same. + + Note that only one of the branches will be executed depending on the value + of index. + See https://www.tensorflow.org/xla/operation_semantics#conditional. + }]; + +} + //===----------------------------------------------------------------------===// // XLA parallelism related op definitions. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h index 079169e9c5c..03e41f6432c 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h @@ -35,22 +35,33 @@ mlir::DenseIntElementsAttr getBroadcastDimensionsAttr(mlir::Builder* b, mlir::Value y, bool allow_empty = true); -/// Get a constant splat for the given value type. +// Get a constant splat for the given value of type. Requires value to be of +// type static shaped RankedTensorType. 
+template +static ElementsAttr getSplat(Builder* b, RankedTensorType ty, T constant) { + Type element_ty = getElementTypeOrSelf(ty); + + if (element_ty.isSignlessInteger()) + return DenseElementsAttr::get(ty, b->getIntegerAttr(element_ty, constant)); + + if (element_ty.isa()) + return DenseElementsAttr::get(ty, b->getFloatAttr(element_ty, constant)); + + if (auto complex_ty = element_ty.dyn_cast()) { + auto complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) + return DenseElementsAttr::get(ty, + static_cast>(constant)); + if (complex_element_ty.isF64()) + return DenseElementsAttr::get( + ty, static_cast>(constant)); + } + llvm_unreachable("unhandled element type"); +} + template static ElementsAttr getSplat(Builder* b, Value val, T constant) { - auto valType = val.getType().cast(); - auto valElementType = getElementTypeOrSelf(val.getType()); - - // Handle integer elements. - Attribute elementAttr; - if (valElementType.isSignlessInteger()) - elementAttr = b->getIntegerAttr(valElementType, constant); - else if (valElementType.isa()) - elementAttr = b->getFloatAttr(valElementType, constant); - else - llvm_unreachable("unhandled element type"); - - return DenseElementsAttr::get(valType, elementAttr); + return getSplat(b, val.getType().cast(), constant); } // Returns DenseElementsAttr of rank zero with the given element type and the diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc index 680a73e49c5..6f9b39377af 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc @@ -55,6 +55,36 @@ XlaLhloDialect::XlaLhloDialect(MLIRContext *context) >(); } +//===----------------------------------------------------------------------===// +// StaticMemRefCastOp +//===----------------------------------------------------------------------===// + +Value StaticMemRefCastOp::getViewSource() { return *getODSOperands(0).begin(); } + +static LogicalResult Verify(StaticMemRefCastOp op) { + if (!op.operand().getType().cast().hasStaticShape()) + return op.emitOpError("operand must have static shape"); + if (!op.getType().hasStaticShape()) + return op.emitOpError("result must have static shape"); + return success(); +} + +//===----------------------------------------------------------------------===// +// DynamicMemRefCastOp +//===----------------------------------------------------------------------===// + +Value DynamicMemRefCastOp::getViewSource() { + return *getODSOperands(0).begin(); +} + +static LogicalResult Verify(DynamicMemRefCastOp op) { + // Check if `sizes` and `strides` args are compatible with the result type. + if (op.sizes().size() != op.getType().getRank()) + return op.emitOpError( + "`sizes` args count must be equal to the rank of the output memref"); + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 190c5ff832d..3827e8a7a4e 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -27,7 +27,8 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ViewLikeInterface.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 3abd117f570..d9f3648bb09 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -19,7 +19,8 @@ limitations under the License. #define LHLO_OPS include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/ViewLikeInterface.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { @@ -92,10 +93,20 @@ def LHLO_CosOp: LHLO_UnaryElementwiseOp<"cosine">, BASE_HLO_CosOp; def LHLO_ExpOp: LHLO_UnaryElementwiseOp<"exponential">, BASE_HLO_ExpOp; +def LHLO_ImagOp: LHLO_Op<"imag", [SameOperandsShape]>, BASE_HLO_ImagOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_LogOp: LHLO_UnaryElementwiseOp<"log">, BASE_HLO_LogOp; def LHLO_NegOp: LHLO_UnaryElementwiseOp<"negate">, BASE_HLO_NegOp; +def LHLO_RealOp: LHLO_Op<"real", [SameOperandsShape]>, BASE_HLO_RealOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_RsqrtOp: LHLO_UnaryElementwiseOp<"rsqrt">, BASE_HLO_RsqrtOp; def LHLO_SqrtOp: LHLO_UnaryElementwiseOp<"sqrt">, BASE_HLO_SqrtOp; @@ -106,27 +117,6 @@ def LHLO_SinOp: LHLO_UnaryElementwiseOp<"sine">, BASE_HLO_SinOp; def LHLO_TanhOp: LHLO_UnaryElementwiseOp<"tanh">, BASE_HLO_TanhOp; -//===----------------------------------------------------------------------===// -// XLA complex unary elementwise op definitions. -//===----------------------------------------------------------------------===// -// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions - -def LHLO_ComplexOp: LHLO_Op<"complex", [SameOperandsShape]>, BASE_HLO_ComplexOp { - let arguments = (ins Arg:$lhs, - Arg:$rhs, - Arg:$output); -} - -def LHLO_ImagOp: LHLO_Op<"imag", [SameOperandsShape]>, BASE_HLO_ImagOp { - let arguments = (ins Arg:$input, - Arg:$output); -} - -def LHLO_RealOp: LHLO_Op<"real", [SameOperandsShape]>, BASE_HLO_RealOp { - let arguments = (ins Arg:$input, - Arg:$output); -} - //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. 
//===----------------------------------------------------------------------===// @@ -144,6 +134,12 @@ class LHLO_BinaryElementwiseOp traits> : def LHLO_AddOp : LHLO_BinaryElementwiseOp<"add", []>, BASE_HLO_AddOp; +def LHLO_ComplexOp: LHLO_Op<"complex", [SameOperandsShape]>, BASE_HLO_ComplexOp { + let arguments = (ins Arg:$lhs, + Arg:$rhs, + Arg:$output); +} + def LHLO_DivOp : LHLO_BinaryElementwiseOp<"divide", []>, BASE_HLO_DivOp; def LHLO_MaxOp : LHLO_BinaryElementwiseOp<"maximum", []>, BASE_HLO_MaxOp; @@ -201,6 +197,19 @@ def LHLO_ReduceWindowOp: LHLO_Op<"reduce_window", [ let regions = (region SizedRegion<1>:$body); } +def LHLO_CaseOp: LHLO_Op<"case", [ + SingleBlockImplicitTerminator<"TerminatorOp"> + ]>, BASE_HLO_CaseOp { + + let arguments = (ins + Arg:$index, + Arg, "", [MemRead]>:$branch_operands, + Arg:$out + ); + + let regions = (region VariadicRegion>:$branches); +} + //===----------------------------------------------------------------------===// // XLA tuple op definitions. //===----------------------------------------------------------------------===// @@ -254,6 +263,96 @@ def HLO_DynamicUpdateSliceOp: LHLO_Op<"dynamic-update-slice", []> { ); } +//===----------------------------------------------------------------------===// +// StaticMemRefCastOp +//===----------------------------------------------------------------------===// + +def HLO_StaticMemRefCastOp: Op]> { + let summary = "static memref cast operation"; + let description = [{ + Allows to modify the offset, sizes and strides of a statically shaped memref. + + Example: + ```mlir + %buf_transformed = + xla_lhlo.static_memref_cast %buf + : memref<1x5xf32> -> memref<5xf32, offset: 2, strides: [1]> + + // The result of the op is a rank-1 memref with `[5]` shape, stride 1 and + // offset 2. + ``` + }]; + + let arguments = (ins Arg:$operand); + let results = (outs Res:$result); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, MemRefType resultType, " # + "Value operand", [{ + result.addOperands(operand); + result.types.push_back(resultType); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getType() { return getResult().getType().cast(); } + }]; + + let verifier = [{ return Verify(*this); }]; + let assemblyFormat = [{ + $operand attr-dict `:` type($operand) `->` type($result) + }]; +} + +//===----------------------------------------------------------------------===// +// DynamicMemRefCastOp +//===----------------------------------------------------------------------===// + +def HLO_DynamicMemRefCastOp: Op]> { + let summary = "dynamic memref cast operation"; + let description = [{ + Change sizes and strides of a memref using the values computed in runtime. + + Example: + ```mlir + %buf_transformed = + xla_lhlo.dynamic_memref_cast %buf(%size_X, %size_Y)[%step_X, %step_Y] + : memref -> memref + // The result of the op is a type-erased memref with `[%size_X, %size_Y]` + // shape and `[%step_X, %step_Y]` strides. The offset will be inherited + // from the input. 
+ ``` + }]; + + let arguments = (ins + Arg:$operand, + Variadic:$sizes, + Variadic:$strides + ); + let results = (outs Res:$result); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, MemRefType resultType, " # + "Value operand, ValueRange sizes, ValueRange strides", [{ + result.addOperands(operand); + result.addOperands(sizes); + result.addOperands(strides); + result.types.push_back(resultType); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getType() { return getResult().getType().cast(); } + }]; + + let verifier = [{ return Verify(*this); }]; + let assemblyFormat = [{ + $operand `(` $sizes `)` `[` $strides `]` attr-dict `:` type($operand) `->` + type($result) + }]; +} + //===----------------------------------------------------------------------===// // XLA Other op definitions. //===----------------------------------------------------------------------===// @@ -436,6 +535,10 @@ def TerminatorOp : let description = [{ Terminator operation for the LHLO dialect. }]; + let builders = [OpBuilder< + "OpBuilder &b, OperationState &result, ValueRange operands", + [{ build(b, result, llvm::None, operands, llvm::None); }] + >]; } #endif // LHLO_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index 99d1da74fc5..774caab77fb 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -56,6 +56,20 @@ static mlir::DenseIntElementsAttr GetI64ElementsAttr( return mlir::DenseIntElementsAttr::get(ty, mlir_values); } +static mlir::DenseIntElementsAttr ConvertPadding( + absl::Span> padding, + mlir::Builder* builder) { + llvm::SmallVector elements; + elements.reserve(padding.size() * 2); + for (const auto& vals : padding) { + elements.push_back(vals.first); + elements.push_back(vals.second); + } + auto ty = mlir::RankedTensorType::get( + {static_cast(padding.size()), 2}, builder->getIntegerType(64)); + return mlir::DenseIntElementsAttr::get(ty, elements); +} + MlirHloBuilder::~MlirHloBuilder() = default; StatusOr MlirHloBuilder::MakeXlaOp(mlir::Value val) { @@ -79,6 +93,31 @@ XlaOp MlirHloBuilder::ConstantLiteral(const LiteralSlice& literal) { }); } +StatusOr MlirHloBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::ArrayAttr config_attr; + if (precision_config) + config_attr = ConvertPrecisionConfig(precision_config, &builder_); + auto op = builder_.create( + loc_, ty, GetValue(lhs), GetValue(rhs), + GetI64ElementsAttr(window_strides, &builder_), + ConvertPadding(padding, &builder_), + GetI64ElementsAttr(lhs_dilation, &builder_), + GetI64ElementsAttr(rhs_dilation, &builder_), + ConvertConvDimensionNumbers(dimension_numbers, &builder_), + builder_.getI64IntegerAttr(feature_group_count), + builder_.getI64IntegerAttr(batch_group_count), config_attr); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::TransposeInternal( const Shape& shape, XlaOp operand, absl::Span permutation) { TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( @@ -170,7 +209,6 @@ StatusOr MlirHloBuilder::Compare(const Shape& shape, XlaOp lhs, shape, builder_)); auto op = 
builder_.create<mlir::xla_hlo::CompareOp>( loc_, ty, GetValue(lhs), GetValue(rhs), - /*broadcast_dimensions=*/mlir::DenseIntElementsAttr(), builder_.getStringAttr(ComparisonDirectionToString(direction))); return MakeXlaOp(op.getResult()); } @@ -243,6 +281,28 @@ StatusOr<XlaOp> MlirHloBuilder::SliceInternal( GetI64ElementsAttr(strides, &builder_))); } +StatusOr<XlaOp> MlirHloBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span<const XlaOp> start_indices, + absl::Span<const int64> slice_sizes) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType<mlir::RankedTensorType>(shape, builder_)); + return MakeXlaOp(builder_.create<mlir::xla_hlo::DynamicSliceOp>( + loc_, result_ty, GetValue(operand), GetValues(start_indices), + GetI64ElementsAttr(slice_sizes, &builder_))); +} + +StatusOr<XlaOp> MlirHloBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span<const XlaOp> start_indices) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType<mlir::RankedTensorType>(shape, builder_)); + return MakeXlaOp(builder_.create<mlir::xla_hlo::DynamicUpdateSliceOp>( + loc_, result_ty, GetValue(operand), GetValue(update), + GetValues(start_indices))); +} + StatusOr<XlaOp> MlirHloBuilder::PadInternal( const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) { diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index dbcb6856971..fc5baaee44d 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -110,6 +110,16 @@ class MlirHloBuilder : public XlaBuilder { private: XlaOp ConstantLiteral(const LiteralSlice& literal) override; + StatusOr<XlaOp> ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span<const int64> window_strides, + absl::Span<const std::pair<int64, int64>> padding, + absl::Span<const int64> lhs_dilation, + absl::Span<const int64> rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) override; + StatusOr<XlaOp> TransposeInternal( const Shape& shape, XlaOp operand, absl::Span<const int64> permutation) override; @@ -165,6 +175,14 @@ class MlirHloBuilder : public XlaBuilder { absl::Span<const int64> limit_indices, absl::Span<const int64> strides) override; + StatusOr<XlaOp> DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span<const XlaOp> start_indices, + absl::Span<const int64> slice_sizes) override; + + StatusOr<XlaOp> DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span<const XlaOp> start_indices) override; + StatusOr<XlaOp> PadInternal(const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) override; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index a1fb6b559e3..3c670ef0c6e 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -602,13 +602,12 @@ LogicalResult ExportXlaOp(BroadcastInDimOp op, OpLoweringContext ctx) { return success(); } -LogicalResult ExportXlaOp(ScalarsToDimensionTensorOp op, - OpLoweringContext ctx) { +LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { // This op has no expression in the legacy export format. return failure(); } -LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { +LogicalResult ExportXlaOp(DynamicIotaOp op, OpLoweringContext ctx) { // This op has no expression in the legacy export format.
return failure(); } @@ -618,7 +617,7 @@ LogicalResult ExportXlaOp(DynamicReshapeOp op, OpLoweringContext ctx) { return failure(); } -LogicalResult ExportXlaOp(ConditionalOp op, OpLoweringContext ctx) { +LogicalResult ExportXlaOp(IfOp op, OpLoweringContext ctx) { xla::XlaComputation true_branch; xla::XlaComputation false_branch; auto& value_map = *ctx.values; @@ -636,6 +635,33 @@ LogicalResult ExportXlaOp(ConditionalOp op, OpLoweringContext ctx) { return success(); } +LogicalResult ExportXlaOp(CaseOp op, OpLoweringContext ctx) { + llvm::DenseMap& value_map = *ctx.values; + OperandRange operands = op.branch_operands(); + MutableArrayRef branches = op.branches(); + llvm::SmallVector branch_operands(branches.size()); + std::vector computations(branches.size()); + std::vector computations_p(branches.size()); + + for (unsigned i = 0; i < branches.size(); ++i) { + branch_operands[i] = value_map[operands[i]]; + computations_p[i] = &computations[i]; + if (failed(ctx.converter->LowerRegionAsComputation(&branches[i], + computations_p[i]))) + return failure(); + } + xla::XlaOp result = + xla::Conditional(value_map[op.index()], computations_p, branch_operands); + if (op.getNumResults() == 1) { + value_map[op.getResult(0)] = result; + } else { + for (auto item : llvm::enumerate(op.getResults())) { + value_map[item.value()] = xla::GetTupleElement(result, item.index()); + } + } + return success(); +} + LogicalResult ExportXlaOp(ConstOp op, OpLoweringContext ctx) { return failure(); } @@ -933,6 +959,8 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U16, uint16) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U32, uint32) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U64, uint64) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C64, std::complex) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C128, std::complex) case xla::PrimitiveType::F16: { llvm::SmallVector values; values.reserve(attr.getNumElements()); @@ -984,6 +1012,21 @@ LogicalResult ConvertToHloModule::Lower( return LowerFunctionCall(&call_op, builder, &value_map); } + if (auto op = dyn_cast(inst)) { + Value operand = op.getOperand(); + auto ty = operand.getType().dyn_cast(); + // If this was a cast from a static shaped tensors, then it is a noop for + // export to HLO and we can use the operand. + if (!ty || !ty.hasStaticShape()) { + inst->emitOpError() + << "requires static shaped operand for HLO translation"; + return failure(); + } + + value_map[op.getResult()] = value_map[operand]; + return success(); + } + // TODO(jpienaar): This doesn't support layouts yet. 
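
The `ExportXlaOp(CaseOp, ...)` hook added above lowers each branch region to an `xla::XlaComputation` and emits a single `xla::Conditional` keyed on the branch index; when the op has more than one result, the tuple result is unpacked with `xla::GetTupleElement`. A minimal sketch of the kind of `xla_hlo.case` this path consumes is shown below; the function name, value names, and element types are chosen only for illustration and are not taken from this change.

```mlir
// Illustrative two-branch case: one operand per branch, each region
// terminated by xla_hlo.return. The exporter lowers each region to an
// XlaComputation and selects between them with xla::Conditional.
func @case_sketch(%index: tensor<i32>, %operand: tensor<f32>) -> tensor<f32> {
  %0 = "xla_hlo.case"(%index, %operand, %operand) ( {
  ^bb0(%arg: tensor<f32>):
    %1 = "xla_hlo.exponential"(%arg) : (tensor<f32>) -> tensor<f32>
    "xla_hlo.return"(%1) : (tensor<f32>) -> ()
  },  {
  ^bb0(%arg: tensor<f32>):
    %1 = "xla_hlo.floor"(%arg) : (tensor<f32>) -> tensor<f32>
    "xla_hlo.return"(%1) : (tensor<f32>) -> ()
  }) : (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
  return %0 : tensor<f32>
}
```

A single-result case like this maps directly to the conditional's result; only the multi-result form goes through the tuple unpacking in the exporter code above.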
if (matchPattern(inst, m_Constant(&const_attr))) { auto literal_or = CreateLiteralFromAttr(const_attr); diff --git a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir deleted file mode 100644 index ad007d0eb50..00000000000 --- a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir +++ /dev/null @@ -1,260 +0,0 @@ -// RUN: tf-opt -test-buffer-assignment -allow-unregistered-dialect -split-input-file %s | FileCheck %s -dump-input-on-failure - -// CHECK-LABEL: func @func_signature_conversion -func @func_signature_conversion(%arg0: tensor<4x8xf32>) { - return -} -// CHECK: ({{.*}}: memref<4x8xf32>) { - -// ----- - -// CHECK-LABEL: func @non_void_to_void_return_op_converter -func @non_void_to_void_return_op_converter(%arg0: tensor<4x8xf32>) -> tensor<4x8xf32> { - return %arg0 : tensor<4x8xf32> -} -// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]]<[[RANK:.*]]>, %[[RESULT:.*]]: [[TYPE]]<[[RANK]]>) { -// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG0]], %[[RESULT]]) -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @func_and_block_signature_conversion -func @func_and_block_signature_conversion(%arg0 : tensor<2xf32>, %cond : i1, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32>{ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : tensor<2xf32>) - ^bb2: - br ^exit(%arg0 : tensor<2xf32>) - ^exit(%arg2: tensor<2xf32>): - return %arg1 : tensor<4x4xf32> -} -// CHECK: (%[[ARG0:.*]]: [[ARG0_TYPE:.*]], %[[COND:.*]]: i1, %[[ARG1:.*]]: [[ARG1_TYPE:.*]], %[[RESULT:.*]]: [[RESULT_TYPE:.*]]) { -// CHECK: br ^[[EXIT_BLOCK:.*]](%[[ARG0]] : [[ARG0_TYPE]]) -// CHECK: br ^[[EXIT_BLOCK]](%[[ARG0]] : [[ARG0_TYPE]]) -// CHECK: ^[[EXIT_BLOCK]](%{{.*}}: [[ARG0_TYPE]]) -// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG1]], %[[RESULT]]) -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @condBranch -func @condBranch(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : tensor<2xf32>) - ^bb2: - %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - br ^exit(%1 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> - -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: cond_br -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @emptyUsesValue -func @emptyUsesValue(%arg0: memref<4xf32>) { - %0 = alloc() : memref<4xf32> - return -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @criticalEdge -func @criticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) - ^bb1: - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - br ^exit(%0 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: cond_br -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @invCriticalEdge -func @invCriticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) - ^bb1: - br ^exit(%0 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: 
"buffer_assignment_test.unary_lowered" -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElse -func @ifElse(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> - return %1 : tensor<2xf32> -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] -// CHECK-NEXT: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElseNoUsers -func @ifElseNoUsers(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - return %arg0 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElseNested -func @ifElseNested(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - cond_br %cond, ^bb3(%arg3 : tensor<2xf32>), ^bb4(%arg4 : tensor<2xf32>) - ^bb3(%arg7 : tensor<2xf32>): - br ^exit(%arg7, %arg3 : tensor<2xf32>, tensor<2xf32>) - ^bb4(%arg8 : tensor<2xf32>): - br ^exit(%arg3, %arg8 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> - return %1 : tensor<2xf32> -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] -// CHECK-NEXT: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @redundantOperations -func @redundantOperations(%arg0: tensor<4xf32>) { - %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> - %2 = "buffer_assignment_test.unary"(%1) : (tensor<4xf32>) -> tensor<4xf32> - return -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = 
alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc -func @moving_alloc_and_inserting_missing_dealloc(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - br ^exit(%0 : memref<2xf32>) - ^bb2: - - %1 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () - br ^exit(%1 : memref<2xf32>) - ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK: "bufer_assignment_test.copy" -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @moving_invalid_dealloc_op_complex -func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : memref<2xf32>) - ^bb2: - %1 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () - dealloc %1 : memref<2xf32> - br ^exit(%1 : memref<2xf32>) - ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK: bufer_assignment_test.copy -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @inserting_missing_dealloc_simple -func @inserting_missing_dealloc_simple(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK: bufer_assignment_test.copy -// CHECK-NEXT: dealloc - -// ----- - -// CHECK-LABEL: func @moving_invalid_dealloc_op -func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - dealloc %0 : memref<2xf32> - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK: bufer_assignment_test.copy -// CHECK-NEXT: dealloc \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index 30255586002..6b9ed3e463a 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -45,6 +45,60 @@ func @multiply_scalar_fold() -> tensor<4xi64> { return %2 : tensor<4xi64> } +// CHECK-LABEL: divide_scalar_fold +func @divide_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<1> + %2 = "xla_hlo.divide"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: divide_fold_float +func @divide_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : 
tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[1.000000e+00, 2.200000e+01, 2.500000e+00, 2.500000e-01]> + %2 = "xla_hlo.divide"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + +// CHECK-LABEL: max_scalar_fold +func @max_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<7> + %2 = "xla_hlo.maximum"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: max_fold_float +func @max_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[5.000000e+00, 6.600000e+01, 5.000000e+00, 4.000000e+00]> + %2 = "xla_hlo.maximum"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + +// CHECK-LABEL: min_scalar_fold +func @min_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<-5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<-5> + %2 = "xla_hlo.minimum"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: min_fold_float +func @min_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[5.000000e+00, 3.000000e+00, 2.000000e+00, 1.000000e+00]> + %2 = "xla_hlo.minimum"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + // CHECK-LABEL: concatenate_noop func @concatenate_noop(%arg0: tensor<4xi32>) -> tensor<4xi32> { // CHECK-SAME: [[ARG:%.+]]: tensor<4xi32> @@ -281,6 +335,14 @@ func @complex_collapse_fold(%arg0: tensor<4xcomplex>) -> tensor<4xcomplex> } +// CHECK-LABEL: @dynamic_iota_is_static +func @dynamic_iota_is_static(%arg0 : tensor<1xindex>) -> tensor<4xi32> { + // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" + // CHECK: return [[RESULT]] + %0 = "xla_hlo.dynamic_iota"(%arg0) {iota_dimension = 0 : i64} : (tensor<1xindex>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + // CHECK-LABEL: @iota_not_lowered_to_constant func @iota_not_lowered_to_constant() -> tensor<4xi32> { // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" @@ -297,16 +359,6 @@ func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> } -// CHECK-LABEL: @extract_scalars_to_tensor -// CHECK-SAME: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32 -func @extract_scalars_to_tensor(%arg0: i32, %arg1: i32) -> i32 { - %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (i32, i32) -> tensor<2xi32> - %1 = constant 0 : index - %2 = extract_element %0[%1] : tensor<2xi32> - // CHECK: return %[[ARG0]] - return %2 : i32 -} - // CHECK-LABEL: func @fold_copy // CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> { @@ -387,8 +439,8 @@ func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor< return %0 : tensor<4x1xf32> } -// CHECK-LABEL: do_not_dce_while -func @do_not_dce_while(%arg0: tensor) -> tensor { +// CHECK-LABEL: do_not_dce_while_with_outfeed +func @do_not_dce_while_with_outfeed(%arg0: tensor) -> tensor { // CHECK: xla_hlo.while %0 = "xla_hlo.while"(%arg0) ( { ^bb0(%arg1: tensor): @@ -404,3 +456,19 @@ 
func @do_not_dce_while(%arg0: tensor) -> tensor { return %arg0 : tensor } + +// CHECK-LABEL: dce_while_without_side_effect +func @dce_while_without_side_effect(%arg0: tensor) -> tensor { + // CHECK-NOT: xla_hlo.while + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + %1 = "xla_hlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + %1 = "xla_hlo.create_token"() : () -> !xla_hlo.token + "xla_hlo.return"(%arg1) : (tensor) -> () + }) : (tensor) -> tensor + + return %arg0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir index ce0243e416c..d67a7d09f7c 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir @@ -6,8 +6,8 @@ // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: tensor func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[BCAST_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[EXTENTS:.+]] = "shape.to_extent_tensor"(%[[BCAST_S]]) // CHECK: return %[[EXTENTS]] diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir index 2bc1e0c6852..7194f7034b5 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -14,8 +14,8 @@ func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor< // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} @@ -31,8 +31,8 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor @@ -48,8 +48,8 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: 
tensor func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 262533bbf08..f4b9fa206f2 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -hlo-legalize-to-lhlo %s -o - | FileCheck %s --dump-input-on-failure +// RUN: xla-opt -hlo-legalize-to-lhlo -buffer-placement %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -13,33 +13,42 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +func @return_func(%arg0: tensor<4xf32>) -> tensor<4xf32> { + return %arg0 : tensor<4xf32> +} +// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) +// CHECK-NEXT: "xla_lhlo.copy"(%[[ARG0]], %[[RESULT]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + +// ----- + // CHECK-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> %1 = xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) %2 = xla_hlo.add %arg0, %1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) %3 = xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) %4 = xla_hlo.subtract %arg1, %3 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) %5 = xla_hlo.multiply %2, %4 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) - // CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> - // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () - // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> return %5 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } +// CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) +// CHECK-NEXT: %[[MAX_RESULT:.*]] 
= alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) +// CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) +// CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) +// CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) +// CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) +// CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> +// CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () +// CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () // ----- @@ -47,20 +56,20 @@ func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { // CHECK: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_summand_1 = tensor_load %summand_1 : memref<2x2xf32> %tensor_summand_2 = tensor_load %summand_2 : memref<2x2xf32> %sum = "xla_hlo.add"(%tensor_summand_1, %tensor_summand_2) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %[[ADD_RESULT]]) + // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_multiplier = tensor_load %multiplier : memref<2x2xf32> %tensor_result = "xla_hlo.multiply"(%sum, %tensor_multiplier) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) + // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) tensor_store %tensor_result, %result : memref<2x2xf32> - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () "xla_lhlo.terminator"() : () -> () @@ -354,7 +363,7 @@ func @add_dyn(%lhs: tensor, %rhs: tensor) { // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> // CHECK: %[[C0:.*]] = constant 0 : index // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -376,7 +385,7 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = 
"xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> // CHECK: %[[C0:.*]] = constant 0 : index // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -386,3 +395,15 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @dot +func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], +// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () + %dot = "xla_hlo.dot"(%arg0, %arg0) + : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %dot : tensor<1024x1024xf32> + } diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index aa949a01388..a27bf2cff79 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -530,3 +530,28 @@ func @convert_f64_to_f32(%input: tensor<2x2xf64>) -> tensor<2x2xf32> { // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f64): // CHECK-NEXT: %[[RESULT:.*]] = fptrunc %[[OPERAND_IN]] : f64 to f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xf32>) -> tensor<2x2xi32> + return %result : tensor<2x2xi32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { + %result = "xla_hlo.reverse"(%input) { + dimensions = dense<1> : tensor<1xi64> + } : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %result : tensor<2x3xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir index cda1dc481a7..6a2b68adac3 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir @@ -8,7 +8,9 @@ // CHECK-SAME: ) { func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { // The only expected instruction is a copy from the input into the output. 
- // CHECK: %[[OUTPUT:.*]] = std.view %[[ARG1]][][] : memref<16xi8> to memref<2x2xf32> + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[C02:.*]] = constant 0 : index + // CHECK: %[[OUTPUT:.*]] = std.view %[[ARG1]][%[[C02]]][] : memref<16xi8> to memref<2x2xf32> // CHECK: xla_lhlo.copy // CHECK-SAME: %[[ARG0]], %[[OUTPUT]] return %value : tensor<2x2xf32> diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir index 83c3f765dc3..83880bc8ce9 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-control-flow.mlir @@ -35,7 +35,7 @@ func @conditional(%arg0: tensor) -> tensor { // CHECK: [[VAL1:%.+]] = extract_element [[VAL0]][] : tensor // CHECK: cond_br [[VAL1]], ^bb1(%arg0 : tensor), ^bb2(%arg0 : tensor) - %1 = "xla_hlo.conditional"(%0, %arg0, %arg0) ( { + %1 = "xla_hlo.if"(%0, %arg0, %arg0) ( { ^bb0(%arg1: tensor): // CHECK: ^bb1([[VAL2:%.+]]: tensor): @@ -131,7 +131,7 @@ func @conditional_with_multiple_blocks(%arg0: tensor, %arg1: tensor, % // CHECK: ^[[EXIT]](%6: tensor): // CHECK: return %6 : tensor // CHECK: } - %1 = "xla_hlo.conditional"(%pred, %arg0, %arg1) ( { + %1 = "xla_hlo.if"(%pred, %arg0, %arg1) ( { ^then_entry(%arg2: tensor): br ^then_succ(%arg2: tensor) ^then_succ(%0: tensor): diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index 08df9fd3808..3605e2a0d5c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -7,8 +7,8 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_basic // CHECK-SAME: ([[LHS:%.*]]: tensor<1x4x2xf32>, [[RHS:%.*]]: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> -// CHECK: [[LHSSHAPE:%.*]] = "shape.shape_of"([[LHS]]) : (tensor<1x4x2xf32>) -> !shape.shape -// CHECK: [[RHSSHAPE:%.*]] = "shape.shape_of"([[RHS]]) : (tensor<3x2x4xf32>) -> !shape.shape +// CHECK: [[LHSSHAPE:%.*]] = shape.shape_of [[LHS]] : tensor<1x4x2xf32> +// CHECK: [[RHSSHAPE:%.*]] = shape.shape_of [[RHS]] : tensor<3x2x4xf32> // CHECK: [[CM2:%.*]] = constant -2 : i32 // CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) // CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) @@ -86,8 +86,8 @@ func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2 // CHECK: [[RHSIM:%.*]] = "xla_hlo.imag"([[RHS]]) // CHECK: [[RHSIMNEG:%.*]] = "xla_hlo.negate"([[RHSIM]]) // CHECK: [[RHSCONJ:%.*]] = "xla_hlo.complex"([[RHSRE]], [[RHSIMNEG]]) -// CHECK: "shape.shape_of"([[LHSCONJ]]) -// CHECK: "shape.shape_of"([[RHSCONJ]]) +// CHECK: shape.shape_of [[LHSCONJ]] +// CHECK: shape.shape_of [[RHSCONJ]] %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> return %0 : tensor<5x4xcomplex> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir new file mode 100644 index 00000000000..c114b8c50a5 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -0,0 +1,334 @@ +// Note 
that binary elementwise tests are run with chlo legalization enabled +// (unlike the rest), since this is the primary use case for such ops and +// verification of shapes and broadcasts is desired. +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" %s | FileCheck %s --dump-input-on-failure + +//===----------------------------------------------------------------------===// +// Binary op legalizations. +// Most of these expand from the same pattern. Full semantics are +// verified for tf.Add and pattern application only for the rest. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @add +func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %[[SUM0:.*]] = xla_hlo.add %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: %[[SUM1:.*]] = xla_hlo.add %[[SUM0]], %arg0 : tensor<2xi32> + // CHECK-NEXT: return %[[SUM1]] : tensor<2xi32> + %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %1 = "tf.AddV2"(%0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %1: tensor<2xi32> +} + +// CHECK-LABEL: func @broadcast_add +// TODO(laurenzo): Change this to a (5 + 2x1) shaped add to make the check +// patterns unambiguous and more interesting (once broadcastable trait is +// fixed upstream). +func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + return %0: tensor<1x2xi32> +} + +// CHECK-LABEL: func @broadcast_multi_dim_add +// TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream +// broadcastable bug is fixed (helps make the CHECK matching unambiguous) +func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + return %0: tensor<4x4x4x4xi32> +} + +// CHECK-LABEL: func @add_dynamic +func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], 
%[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %4, %5 : tensor + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @div +func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Div"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @shift_left +func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> + %0 = "tf.LeftShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @div_unranked +func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { + // CHECK: tf.Div + %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @maximum +func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> + %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// CHECK-LABEL: func @minimum +func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> + %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +// CHECK-LABEL: func @mul +func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.multiply %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Mul"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @real_div +func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> + %0 = "tf.RealDiv"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @sub +func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> + // CHECK-NEXT: return %0 : tensor<2xi32> + %0 = "tf.Sub"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @shift_right +func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @shift_right_unsigned +func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { + // CHECK: tf.RightShift + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<4xui8>) -> tensor<4xui8> + return %0 : tensor<4xui8> +} + +// CHECK-LABEL: func @broadcast_shift_right_unsigned +func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { + // CHECK: tf.RightShift + 
%0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> + return %0 : tensor<2x4xui8> +} + +// CHECK-LABEL: func @and +func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { + // CHECK-NEXT: xla_hlo.and + %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @and_unranked +func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { + // CHECK: tf.LogicalAnd + %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @or +func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { + // CHECK-NEXT: xla_hlo.or + %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @bitwise_or +func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK-NEXT: xla_hlo.or + %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0: tensor<4xi32> +} + +// CHECK-LABEL: func @bitwise_and +func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { + // CHECK-NEXT: xla_hlo.and + %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %0: tensor<4xi32> +} + +// CHECK-LABEL: func @pow +func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-NEXT: xla_hlo.power + %0 = "tf.Pow"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %0: tensor<2xf32> +} + +//===----------------------------------------------------------------------===// +// Equality op legalizations. +// tf.Equal and tf.NotEqual expand from the same pattern. Full semantics are +// verified for tf.Equal and pattern application only for tf.NotEqual +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @equal +func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @equal_dynamic +func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @equal_broadcast +func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, 
%[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error +func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @equal_incompatible_shape_broadcastable +func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @equal_incompatible_shape_dynamic +func @equal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @equal_incompatible_shape_both_dynamic +func @equal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @equal_unranked +func @equal_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> { + // CHECK: "tf.Equal" + %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @notequal +func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} + %0 = "tf.NotEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +//===----------------------------------------------------------------------===// +// Compare op legalizations. +// These expand from the same pattern. Full semantics are checked for +// tf.Greater. Others just check that the pattern applied. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @greater +func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @broadcast_greater +func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + return %0: tensor<1x2xi1> +} + +// CHECK-LABEL: func @greater_dynamic +func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + +// CHECK-LABEL: func @greater_uranked +func @greater_uranked(%arg0: tensor<*xi32>) -> tensor<*xi1> { + // CHECK: "tf.Greater" + %0 = "tf.Greater"(%arg0, %arg0) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + return %0: tensor<*xi1> +} + +// CHECK-LABEL: func @greater_equal +func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} + %0 = "tf.GreaterEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @less +func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} + %0 = "tf.Less"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} + +// CHECK-LABEL: func @less_equal +func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} + %0 = "tf.LessEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + return %0: tensor<2xi1> +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir index 2984ba46993..b3307a8f52a 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir +++ 
b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir @@ -1,12 +1,12 @@ // RUN: tf-opt -xla-legalize-tf-control-flow %s | FileCheck %s --dump-input-on-failure -// CHECK-LABEL: @conditional -func @conditional(%arg0: tensor, %arg1: tensor) -> (tensor) +// CHECK-LABEL: @if +func @if(%arg0: tensor, %arg1: tensor) -> (tensor) attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { // CHECK: [[VAL0:%.+]] = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor // CHECK: [[VAL1:%.+]] = "xla_hlo.tuple"(%arg0, %arg1) - // CHECK: [[VAL2:%.+]] = "xla_hlo.conditional"([[VAL0]], [[VAL1]], [[VAL1]]) ( { + // CHECK: [[VAL2:%.+]] = "xla_hlo.if"([[VAL0]], [[VAL1]], [[VAL1]]) ( { // CHECK: ^bb0(%arg2: tuple, tensor>): // CHECK: [[VAL4:%.+]] = "xla_hlo.get_tuple_element"(%arg2) {index = 0 : i32} // CHECK: [[VAL5:%.+]] = "xla_hlo.get_tuple_element"(%arg2) {index = 1 : i32} @@ -40,7 +40,52 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { return %0 : tensor } -// CHECK-LABEL: @while + +// CHECK-LABEL: func @case +// CHECK-SAME: %[[BRANCH_INDEX:.*]]: tensor, %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> (tensor, tensor) +func @case(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor]} : (tensor, tensor, tensor) -> (tensor, tensor) + // CHECK: %[[TUPLE_INPUT:.*]] = "xla_hlo.tuple"(%[[ARG0]], %[[ARG1]]) : (tensor, tensor) -> tuple, tensor> + // CHECK: %[[CASE:.*]]:2 = "xla_hlo.case"(%[[BRANCH_INDEX]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]]) ( { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_EXP:.*]]:2 = call @exponential(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) -> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_EXP]]#0, %[[CALL_EXP]]#1) : (tensor, tensor) -> () + // CHECK: }, { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_LOG:.*]]:2 = call @log(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) -> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_LOG]]#0, %[[CALL_LOG]]#1) : (tensor, tensor) -> () + // CHECK: }, { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_FLOOR:.*]]:2 = call @floor(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) -> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_FLOOR]]#0, %[[CALL_FLOOR]]#1) : (tensor, tensor) -> () + // CHECK: }) : (tensor, tuple, tensor>, tuple, tensor>, tuple, tensor>) -> (tensor, tensor) + return %0#0, %0#1 : tensor, tensor +// CHECK: return %[[CASE]]#0, %[[CASE]]#1 : tensor, tensor +} + +func 
@exponential(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.exponential"(%arg1) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + +func @log(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.log"(%arg0) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + +func @floor(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + + +// CHECK-LABEL: func @while func @while(%arg0: tensor {tf_saved_model.index_path = [0]}) -> (tensor {tf_saved_model.index_path = []}) attributes {tf._input_shapes = ["tfshape$"]} { // CHECK: [[VAL0:%.+]] = xla_hlo.constant dense<0> diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir index d2b4d269fef..0660af4ed1c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir @@ -1,22 +1,24 @@ // RUN: tf-opt %s -xla-legalize-tf -split-input-file -verify-diagnostics +// expected-error@below{{The following operations cannot be legalized: tf.NoOp (count: 1); tf_executor.fetch (count: 1); tf_executor.graph (count: 1); tf_executor.island (count: 1); tf_executor.yield (count: 1). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} func @tf_executor_graph_op() { - // expected-error@+1 {{failed to legalize operation 'tf_executor.graph'}} tf_executor.graph { %0 = tf_executor.island { + // expected-error@+1 {{'tf.NoOp' op is not legalizable}} "tf.NoOp"() {} : () -> () tf_executor.yield } tf_executor.fetch } return - } // ----- +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} func @tf_unknown_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // expected-error@+1 {{failed to legalize operation 'tf.OpA'}} + // expected-error@+1 {{'tf.OpA' op is not legalizable}} %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } @@ -27,3 +29,16 @@ func @tf_known_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } + +// ----- + +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1); tf.OpB (count: 2). 
These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} +func @tf_unknown_known_mix(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // expected-error@+1 {{'tf.OpA' op is not legalizable}} + %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %1 = "tf.OpB"(%0, %0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %2 = "tf.Add"(%1, %1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %3 = "tf.OpB"(%2, %2) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %2: tensor<2xi32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index 01398eb7314..e8d5cfe997d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -163,6 +163,30 @@ func @truncated_normal() -> tensor<2x2xf32> { return %1 : tensor<2x2xf32> } +// CHECK-LABEL: dynamic_update_slice +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xi32>, %[[ARG1:.*]]: tensor<2x2xi32>, %[[ARG2:.*]]: tensor<2xi32> +func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2: tensor<2xi32>) -> tensor<3x4xi32> { + + // CHECK: %[[SLICE0:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM0:.*]] = "xla_hlo.reshape"(%[[SLICE0]]) : (tensor<1xi32>) -> tensor + + // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<2> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<1xi32>) -> tensor + + // CHECK: "xla_hlo.dynamic-update-slice"(%[[ARG0]], %[[ARG1]], %[[DIM0]], %[[DIM1]]) + + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<3x4xi32>, tensor<2x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %0: tensor<3x4xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index e15101a165e..6406e2fee48 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1,4 +1,11 @@ -// RUN: tf-opt -xla-legalize-tf=allow-partial-conversion %s | FileCheck %s --dump-input-on-failure +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FileCheck %s --dump-input-on-failure +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s +// This test runs twice: +// 1. Through FileCheck with chlo legalization disabled since verifying +// that the chlo ops emit produces more useful tests. +// 2. With chlo legalization enabled, verifying diagnostics to pick up any +// issues with the full lowering (can catch some broadcasting corner +// cases which emit with a warning). //===----------------------------------------------------------------------===// // BatchNorm op legalizations. 
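// A minimal illustrative sketch (hypothetical test case, not part of this patch)
// of the pattern the RUN lines above are meant to exercise: with
// legalize-chlo=false, a broadcasting TF binary op is checked against the
// client-HLO broadcast form rather than a fully lowered xla_hlo op; the second
// RUN line additionally lowers those xla_chlo ops and only verifies diagnostics.
//
// CHECK-LABEL: func @example_broadcast_add
func @example_broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
  // CHECK: xla_chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>}
  %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
  return %0 : tensor<1x2xi32>
}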
@@ -47,7 +54,7 @@ func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32> // CHECK: "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 1 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK: %[[VAR:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK: xla_hlo.constant - // CHECK: "xla_hlo.multiply"(%[[VAR]], {{.*}}) : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK: xla_chlo.broadcast_multiply %[[VAR]], {{.*}} : (tensor<8xf32>, tensor) -> tensor<8xf32> return %0#0 : tensor<8x8x8x8xf32> } @@ -68,18 +75,18 @@ func @fusedBatchNormV3_training_exponentialAvgFactor(%arg0: tensor<8x8x8x8xf32>, // CHECK-DAG: %[[BATCH_VAR:.*]] = "xla_hlo.get_tuple_element"(%[[RESULT0]]) {index = 2 : i32} // CHECK: %[[FACTOR:.*]] = xla_hlo.constant dense<1.00195694> - // CHECK: %[[CORRECTED_VAR:.*]] = "xla_hlo.multiply"(%[[BATCH_VAR]], %[[FACTOR]]) + // CHECK: %[[CORRECTED_VAR:.*]] = xla_chlo.broadcast_multiply %[[BATCH_VAR]], %[[FACTOR]] // CHECK-DAG: %[[ALPHA:.*]] = xla_hlo.constant dense<0.199999988> // CHECK-DAG: %[[BETA:.*]] = xla_hlo.constant dense<8.000000e-01> - // CHECK: %[[ALPHA_MUL_OLD_MEAN:.*]] = "xla_hlo.multiply"(%[[ALPHA]], %arg3) - // CHECK: %[[BETA_MUL_BATCH_MEAN:.*]] = "xla_hlo.multiply"(%[[BETA]], %[[BATCH_MEAN]]) - // CHECK: %[[NEW_BATCH_MEAN:.*]] = xla_hlo.add %[[ALPHA_MUL_OLD_MEAN]], %[[BETA_MUL_BATCH_MEAN]] + // CHECK: %[[ALPHA_MUL_OLD_MEAN:.*]] = xla_chlo.broadcast_multiply %[[ALPHA]], %arg3 + // CHECK: %[[BETA_MUL_BATCH_MEAN:.*]] = xla_chlo.broadcast_multiply %[[BETA]], %[[BATCH_MEAN]] + // CHECK: %[[NEW_BATCH_MEAN:.*]] = xla_chlo.broadcast_add %[[ALPHA_MUL_OLD_MEAN]], %[[BETA_MUL_BATCH_MEAN]] - // CHECK: %[[ALPHA_MUL_OLD_VAR:.*]] = "xla_hlo.multiply"(%[[ALPHA]], %arg4) - // CHECK: %[[BETA_MUL_CORRECTED_VAR:.*]] = "xla_hlo.multiply"(%[[BETA]], %[[CORRECTED_VAR]]) - // CHECK: %[[NEW_BATCH_VAR:.*]] = xla_hlo.add %[[ALPHA_MUL_OLD_VAR]], %[[BETA_MUL_CORRECTED_VAR]] + // CHECK: %[[ALPHA_MUL_OLD_VAR:.*]] = xla_chlo.broadcast_multiply %[[ALPHA]], %arg4 + // CHECK: %[[BETA_MUL_CORRECTED_VAR:.*]] = xla_chlo.broadcast_multiply %[[BETA]], %[[CORRECTED_VAR]] + // CHECK: %[[NEW_BATCH_VAR:.*]] = xla_chlo.broadcast_add %[[ALPHA_MUL_OLD_VAR]], %[[BETA_MUL_CORRECTED_VAR]] // CHECK: return %[[NEW_BATCH_MEAN]], %[[NEW_BATCH_VAR]], %[[BATCH_MEAN]], %[[BATCH_VAR]] return %0#1, %0#2, %0#3, %0#4 : tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32> @@ -127,11 +134,12 @@ func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -142,10 +150,10 @@ func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -185,11 +193,12 @@ func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> 
tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -200,10 +209,11 @@ func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -270,11 +280,12 @@ func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -285,10 +296,11 @@ func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< // CHECK-NEXT: }) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = 
dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 1, 2]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -355,11 +367,12 @@ func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[act:.*]] = "xla_hlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = xla_hlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg4, %[[eps]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = xla_chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = "xla_hlo.rsqrt"(%[[add]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[sub:.*]] = "xla_hlo.subtract"(%[[act]], %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[sub:.*]] = xla_hlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 2, 3]> : tensor<3xi64> // CHECK-NEXT: %[[cmul:.*]] = "xla_hlo.convert"(%[[mul]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[init:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor @@ -370,10 +383,11 @@ func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: }) {dimensions = dense<[0, 2, 3]> : tensor<3xi64>} : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr2:.*]] = "xla_hlo.convert"(%[[red1]]) : (tensor<8xf32>) -> tensor<8xf32> - // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> - // CHECK-NEXT: %[[mul3:.*]] = "xla_hlo.multiply"(%[[grad]], %[[mul2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8x8x8x8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul2:.*]] = xla_hlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> 
tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[mul3:.*]] = xla_hlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<8xf32> + // CHECK-NEXT: %[[scale_backprop:.*]] = xla_hlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> // CHECK-NEXT: xla_hlo.constant dense<[0, 2, 3]> : tensor<3xi64> // CHECK-NEXT: %[[cgrad:.*]] = "xla_hlo.convert"(%[[grad]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> @@ -405,280 +419,41 @@ func @fusedBatchNormGradV3_Training_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_NCHW func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_dynamic func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[ARG0_SHAPE]]) + // CHECK: %[[ARG1_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = xla_hlo.add %arg0, %[[ARG1_BCAST]] %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW"} : (tensor, tensor) -> tensor return %0 : tensor } //===----------------------------------------------------------------------===// -// Binary op legalizations. 
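// For reference, a hedged sketch (illustrative only, not emitted verbatim by the
// pass) of the expanded IR that the dynamic-broadcast CHECK lines for BiasAdd
// above abbreviate; the !shape.shape operand type and the value names here are
// assumptions, not taken from the pass output:
//
//   %shape   = shape.shape_of %arg0 : tensor<1x32x10x32xi32>
//   %extents = "shape.to_extent_tensor"(%shape) : (!shape.shape) -> tensor<4xindex>
//   %bcast   = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %extents)
//                {broadcast_dimensions = dense<3> : tensor<1xi64>}
//                : (tensor<32xi32>, tensor<4xindex>) -> tensor<1x32x10x32xi32>
//   %sum     = xla_hlo.add %arg0, %bcast : tensor<1x32x10x32xi32>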
+// DiagPart //===----------------------------------------------------------------------===// -// CHECK-LABEL: func @add -func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %[[SUM0:.*]] = xla_hlo.add %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: %[[SUM1:.*]] = xla_hlo.add %[[SUM0]], %arg0 : tensor<2xi32> - // CHECK-NEXT: return %[[SUM1]] : tensor<2xi32> - %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - %1 = "tf.AddV2"(%0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %1: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_add -func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @broadcast_multi_dim_add -func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} - %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> - return %0: tensor<4x4x4x4xi32> -} - -// CHECK-LABEL: func @div -func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Div"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_div -func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @shift_left -func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> - %0 = "tf.LeftShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0 : tensor<4xi32> -} - -// CHECK-LABEL: func @div_dynamic -func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @div_unranked -func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { - // CHECK: tf.Div - %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @maximum -func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> - %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// CHECK-LABEL: func @minimum -func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> - %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// CHECK-LABEL: func @mul -func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.multiply %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Mul"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> 
- return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_mul -func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @real_div -func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> - %0 = "tf.RealDiv"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_real_div -func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RealDiv"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @sub -func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> - // CHECK-NEXT: return %0 : tensor<2xi32> - %0 = "tf.Sub"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @broadcast_sub -func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - -// CHECK-LABEL: func @shift_right -func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0 : tensor<4xi32> -} - -// CHECK-LABEL: func @broadcast_shift_right -func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - // CHECK: "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// CHECK-LABEL: func @shift_right_unsigned -func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { - // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<4xui8>) -> tensor<4xui8> - return %0 : tensor<4xui8> -} - -// CHECK-LABEL: func @broadcast_shift_right_unsigned -func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { - // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> - return %0 : tensor<2x4xui8> -} - -// CHECK-LABEL: func @and -func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @and_broadcast -func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @and_dynamic -func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: "xla_hlo.and" - 
%0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @and_unranked -func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { - // CHECK: tf.LogicalAnd - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @or -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @or_broadcast -func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @or_dynamic -func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @bitwise_or -func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0: tensor<4xi32> -} - -// CHECK-LABEL: func @bitwise_or_broadcast -func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_or_dynamic -func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @bitwise_and -func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - return %0: tensor<4xi32> -} - -// CHECK-LABEL: func @bitwise_and_broadcast -func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_and_dynamic -func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @pow -func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: xla_hlo.power - %0 = "tf.Pow"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - return %0: tensor<2xf32> -} - -// CHECK-LABEL: func @pow_dynamic -func @pow_dynamic(%arg0: tensor) -> tensor { - // CHECK-NEXT: xla_hlo.power - %0 = "tf.Pow"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @diag_part // CHECK-SAME: %[[ARG:.*]]: tensor<4x3x4x3xf32> func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { @@ -698,6 +473,10 @@ func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { return %0: tensor<4x3xf32> } +//===----------------------------------------------------------------------===// +// Einsum. 
+//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @einsum func @einsum(%arg0: tensor<2x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<2x4xf32> { // CHECK: xla_hlo.einsum @@ -712,22 +491,26 @@ func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { return %0: tensor<2x2xf32> } +//===----------------------------------------------------------------------===// +// FloorDiv and FloorMod. +//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @floordiv_broadcast_i32 func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"(%arg0, [[ZEROS1]]) {comparison_direction = "LT"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZEROS2]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[CMP1]], [[CMP2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - // CHECK-DAG: [[DIV1:%.+]] = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ABS2]], [[ZEROS3]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"([[ABS1]], [[SUB]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[DIV2:%.+]] = "xla_hlo.divide"([[NEG]], [[ABS3]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> @@ -737,19 +520,19 @@ func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> te // CHECK-LABEL: func @floordiv_reverse_broadcast_i32 func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"(%arg0, [[ZEROS1]]) {comparison_direction = "LT"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZEROS2]]) {comparison_direction = "LT"} - // CHECK-DAG: 
[[CMP3:%.+]] = "xla_hlo.compare"([[CMP1]], [[CMP2]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - // CHECK-DAG: [[DIV1:%.+]] = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ABS2]], [[ZEROS3]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"([[ABS1]], [[SUB]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[DIV2:%.+]] = xla_hlo.divide [[NEG]], [[ABS3]] + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -758,7 +541,7 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 // CHECK-LABEL: func @floordiv_f32 func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: %[[DIV:.*]] = xla_hlo.divide %arg0, %arg0 + // CHECK-NEXT: %[[DIV:.*]] = xla_chlo.broadcast_divide %arg0, %arg0 // CHECK-NEXT: %[[FLOOR:.*]] = "xla_hlo.floor"(%[[DIV]]) // CHECK-NEXT: return %[[FLOOR]] : tensor<2xf32> %0 = "tf.FloorDiv"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> @@ -769,7 +552,7 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { func @floordiv_bf16(%arg0: tensor<2xbf16>) -> tensor<2xbf16> { // CHECK-NEXT: xla_hlo.convert // CHECK-NEXT: xla_hlo.convert - // CHECK-NEXT: xla_hlo.divide + // CHECK-NEXT: xla_chlo.broadcast_divide // CHECK-NEXT: xla_hlo.floor // CHECK-NEXT: xla_hlo.convert // CHECK-NEXT: return @@ -779,7 +562,7 @@ func @floordiv_bf16(%arg0: tensor<2xbf16>) -> tensor<2xbf16> { // CHECK-LABEL: func @floordiv_f16_broadcast func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { - // CHECK-NEXT: xla_hlo.divide + // CHECK-NEXT: xla_chlo.broadcast_divide // CHECK-NEXT: xla_hlo.floor // CHECK-NEXT: return %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> @@ -788,7 +571,22 @@ func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> te // CHECK-LABEL: func @floordiv_dynamic func @floordiv_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: tf.FloorDiv + // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} + // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: 
[[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) + // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) + // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) + // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -802,15 +600,15 @@ func @floordiv_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*x // CHECK-LABEL: func @floormod_broadcast_numerator func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[REM:%.+]] = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"([[REM]], [[ZL]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZR:%.+]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[REM:%.+]], [[ZR]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP4:%.+]] = "xla_hlo.compare"([[CMP2]], [[CMP3]]) {comparison_direction = "NE"} - // CHECK-DAG: [[AND:%.+]] = xla_hlo.and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = xla_hlo.add %arg1, [[REM]] + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -819,15 +617,15 @@ func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) // CHECK-LABEL: func @floormod_broadcast_denominator func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[REM:%.+]] = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // 
CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP1:%.+]] = "xla_hlo.compare"([[REM]], [[ZL]]) {comparison_direction = "NE"} + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = "xla_hlo.compare"(%arg1, [[ZR:%.+]]) {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = "xla_hlo.compare"([[REM:%.+]], [[ZR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - // CHECK-DAG: [[CMP4:%.+]] = "xla_hlo.compare"([[CMP2]], [[CMP3]]) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - // CHECK-DAG: [[AND:%.+]] = xla_hlo.and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = "xla_hlo.add"(%arg1, [[REM]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> @@ -836,7 +634,17 @@ func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32 // CHECK-LABEL: func @floormod_dynamic func @floormod_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: tf.FloorMod + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = "NE"} + // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) + // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -848,6 +656,10 @@ func @floormod_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*x return %0: tensor<*xi32> } +//===----------------------------------------------------------------------===// +// BroadcastTo. 
+//===----------------------------------------------------------------------===// + // CHECK-LABEL: func @broadcast_to func @broadcast_to(%arg0: tensor<16xf32>) -> tensor<16x16x16x16xf32> { %cst = "tf.Const"() { value = dense<16> : tensor<4xi32> } : () -> tensor<4xi32> @@ -860,190 +672,6 @@ func @broadcast_to(%arg0: tensor<16xf32>) -> tensor<16x16x16x16xf32> { return %0 : tensor<16x16x16x16xf32> } -//===----------------------------------------------------------------------===// -// Equality op legalizations. -//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @equal -func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @equal_dynamic -func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @equal_broadcast -func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error -func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @equal_incompatible_shape_broadcastable -func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @equal_incompatible_shape_dynamic -func @equal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @equal_incompatible_shape_both_dynamic -func @equal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @equal_unranked -func @equal_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> { - // CHECK: "tf.Equal" - %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal -func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg0) : 
(tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @notequal_dynamic -func @notequal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_broadcast -func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error -func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable -func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_incompatible_shape_dynamic -func @notequal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_both_dynamic -func @notequal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -//===----------------------------------------------------------------------===// -// Compare op legalizations. 
-//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @greater -func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_greater -func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @greater_dynamic -func @greater_dynamic(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @greater_uranked -func @greater_uranked(%arg0: tensor<*xi32>) -> tensor<*xi1> { - // CHECK: "tf.Greater" - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @greater_equal -func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_greater_equal -func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @less -func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_less -func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @less_equal -func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> - return %0: tensor<2xi1> -} - -// CHECK-LABEL: func @broadcast_less_equal -func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - - //===----------------------------------------------------------------------===// // Complex op legalizations. 
//===----------------------------------------------------------------------===// @@ -1332,12 +960,12 @@ func @matrix_band_part(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: ten // CHECK: %[[X:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<64x64xbf16> // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64x64xbf16> // CHECK: %[[OFFSET:.*]] = xla_hlo.subtract %[[X]], %[[Y]] : tensor<64x64xbf16> - // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<64x64xbf16>) -> tensor<*xi1> + // CHECK: %[[G:.*]] = xla_chlo.broadcast_compare %[[F]], %[[OFFSET]] {comparison_direction = "LE"} : (tensor, tensor<64x64xbf16>) -> tensor<64x64xi1> // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor - // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<64x64xbf16>, tensor) -> tensor<*xi1> + // CHECK: %[[I:.*]] = xla_chlo.broadcast_compare %[[OFFSET]], %[[H]] {comparison_direction = "LE"} : (tensor<64x64xbf16>, tensor) -> tensor<64x64xi1> - // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<*xi1> + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<64x64xi1> // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<64x64xbf16> // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) @@ -1353,11 +981,11 @@ func @matrix_band_part_2(%arg0: tensor<12x24x48xbf16>, %arg1: tensor, %arg2 // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<24x48xbf16> // CHECK: %[[OFFSET:.*]] = xla_hlo.subtract %[[X]], %[[Y]] : tensor<24x48xbf16> - // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<24x48xbf16>) -> tensor<*xi1> + // CHECK: %[[G:.*]] = xla_chlo.broadcast_compare %[[F]], %[[OFFSET]] {comparison_direction = "LE"} : (tensor, tensor<24x48xbf16>) -> tensor<24x48xi1> // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor - // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<24x48xbf16>, tensor) -> tensor<*xi1> - // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : tensor<*xi1> + // CHECK: %[[I:.*]] = xla_chlo.broadcast_compare %[[OFFSET]], %[[H]] {comparison_direction = "LE"} : (tensor<24x48xbf16>, tensor) -> tensor<24x48xi1> + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<24x48xi1> // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<12x24x48xbf16> // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) @@ -1504,7 +1132,8 @@ func @max_pool_3d_grad_same(%orig_input: tensor<2x8x13x25x7xf32>, %orig_output: // CHECK-LABEL:one_hot func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tensor) -> tensor<3x5xf32> { // CHECK: %[[IOTA:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<3x5xi32> - // CHECK: %[[COMPARE:.*]] = "xla_hlo.compare"(%arg0, %[[IOTA]]) {broadcast_dimensions = dense<0> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<3xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> + // CHECK: %[[BCAST_ARG0:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<3x5xi32> + // CHECK: %[[COMPARE:.*]] = "xla_hlo.compare"(%[[BCAST_ARG0]], %[[IOTA]]) {comparison_direction = "EQ"} : (tensor<3x5xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> // CHECK: %[[ON_VALUE:.*]] = "xla_hlo.broadcast"(%arg1) 
{broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> // CHECK: %[[OFF_VALUE:.*]] = "xla_hlo.broadcast"(%arg2) {broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> // CHECK: %[[RESULT:.*]] = "xla_hlo.select"(%[[COMPARE]], %[[ON_VALUE]], %[[OFF_VALUE]]) : (tensor<3x5xi1>, tensor<3x5xf32>, tensor<3x5xf32>) -> tensor<3x5xf32> @@ -1596,6 +1225,44 @@ func @unhandled_partitioned_call_2(%arg0: tensor, %arg1: tensor<*xi32>) -> return %0, %1 : tensor, tensor } + +//===----------------------------------------------------------------------===// +// ReverseV2 op legalization. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @reverse_func_32 +func @reverse_func_32(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi32>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_64 +func @reverse_func_64(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> (tensor<1xi64>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi64>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_neg +func @reverse_func_neg(%arg0: tensor<5x5xi32>) -> tensor<5x5xi32> { + %axis = "tf.Const"() {value = dense<[-1]> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<1> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5x5xi32>, tensor<1xi32>) -> tensor<5x5xi32> + + // CHECK: return [[VAL]] : tensor<5x5xi32> + return %reversed : tensor<5x5xi32> +} + //===----------------------------------------------------------------------===// // StatefulPartitionedCall op legalization. 
//===----------------------------------------------------------------------===// @@ -1631,7 +1298,7 @@ func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) -> (te // CHECK-LABEL: func @relu func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor - // CHECK: "xla_hlo.maximum"(%[[ZERO]], %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK: xla_chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Relu"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> return %0: tensor<1xi32> } @@ -1639,7 +1306,7 @@ func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @relu_unranked func @relu_unranked(%arg0: tensor) -> tensor { // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor - // CHECK: "xla_hlo.maximum"(%[[ZERO]], %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: xla_chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor %0 = "tf.Relu"(%arg0) : (tensor) -> tensor return %0: tensor } @@ -1667,8 +1334,8 @@ func @relu6_unranked(%arg0: tensor) -> tensor { func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { // CHECK-DAG: %[[ZERO_SCALAR:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> - // CHECK-DAG: %[[PRED:.*]] = "xla_hlo.compare"(%[[FEATURES]], %[[ZERO_SCALAR]]) {comparison_direction = "GT"} : (tensor, tensor) -> tensor<*xi1> - // CHECK-DAG: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor<*xi1>, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + // CHECK-DAG: %[[PRED:.*]] = xla_chlo.broadcast_compare %[[FEATURES]], %[[ZERO_SCALAR]] {comparison_direction = "GT"} : (tensor, tensor) -> tensor + // CHECK-DAG: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> // CHECK-DAG: return %[[RESULT]] : tensor<4x8xf32> %2 = "tf.ReluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> return %2 : tensor<4x8xf32> @@ -1678,27 +1345,6 @@ func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tens // Select op legalizations. 
//===----------------------------------------------------------------------===// -// CHECK-LABEL: func @select -func @select(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - return %0: tensor<2xi32> -} - -// CHECK-LABEL: func @select_float -func @select_float(%arg0: tensor<2xi1>, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - return %0: tensor<2xf32> -} - -// CHECK-LABEL: func @select_multidimensional -func @select_multidimensional(%arg0: tensor<3x2xi1>, %arg1: tensor<3x2xi32>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> { - // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) - %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> - return %0: tensor<3x2xi32> -} - // CHECK-LABEL: func @selectv2 func @selectv2(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: "xla_hlo.select"(%arg0, %arg1, %arg2) @@ -1737,6 +1383,14 @@ func @selectv2_broadcast_pred(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %ar return %0: tensor<2x8x8xi32> } +// CHECK-LABEL: func @selectv2_broadcast_tensor_pred +func @selectv2_broadcast_tensor_pred(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> { + // CHECK: %[[BROADCAST:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi1>) -> tensor<2x3xi1> + // CHECK: "xla_hlo.select"(%[[BROADCAST]], %arg1, %arg2) + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16> + return %0: tensor<2x3xf16> +} + // CHECK-LABEL: func @selectv2_broadcast_all func @selectv2_broadcast_all(%arg0: tensor<8x1x1xi1>, %arg1: tensor<1x8x1xi32>, %arg2: tensor<1x1x8xi32>) -> tensor<8x8x8xi32> { // CHECK-DAG: %[[BROADCAST_0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x1x1xi1>) -> tensor<8x8x8xi1> @@ -1778,7 +1432,10 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: {dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf32>, tensor) -> tensor<2xf32> // CHECK: %[[CASTED_MAX:.*]] = "xla_hlo.convert"(%[[MAX]]) : (tensor<2xf32>) -> tensor<2xf32> - // CHECK: %[[SHIFTED_INP:.*]] = "xla_hlo.subtract"(%[[ARG0]], %[[CASTED_MAX]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_MAX:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[CASTED_MAX]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[SHIFTED_INP:.*]] = xla_hlo.subtract %[[ARG0]], %[[BCAST_MAX]] // CHECK: %[[EXP:.*]] = "xla_hlo.exponential"(%[[SHIFTED_INP]]) // Verify reduce op for summation and its body. 
@@ -1790,8 +1447,11 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: {dimensions = dense<1> : tensor<1xi64>} // CHECK: %[[CASTED_SUM:.*]] = "xla_hlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> - // CHECK: %[[RESULT:.*]] = "xla_hlo.divide"(%[[EXP]], %[[CASTED_SUM]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // return %[[RESULT]] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_SUM:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[CASTED_SUM]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT:.*]] = xla_hlo.divide %[[EXP]], %[[BCAST_SUM]] + // CHECK: return %[[RESULT]] %0 = "tf.Softmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> return %0: tensor<2x3xf32> @@ -1800,7 +1460,7 @@ func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // Verify intermediate and final shape are correct with dynamic shapes. // CHECK-LABEL: func @dynamic_softmax func @dynamic_softmax(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.divide"({{.*}}) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor) -> tensor + // CHECK: xla_hlo.divide {{.*}} : tensor %0 = "tf.Softmax"(%arg0) : (tensor) -> tensor return %0: tensor } @@ -1826,43 +1486,29 @@ func @rank4_softmax(%arg0: tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> { // CHECK: "xla_hlo.reduce" // CHECK: dimensions = dense<3> - // CHECK: "xla_hlo.divide"{{.*}} {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: xla_hlo.divide {{.*}} %0 = "tf.Softmax"(%arg0) : (tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> return %0: tensor<2x3x4x5xf16> } //===----------------------------------------------------------------------===// // LogSoftmax op legalizations. +// This just changes the tail of the regular Softmax legalization //===----------------------------------------------------------------------===// // CHECK-LABEL: func @simple_logsoftmax // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3xf32>) func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - - // Verify reduce op for max computation and its body. - // CHECK-DAG: %[[CASTED_INP:.*]] = "xla_hlo.convert"(%[[ARG0]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[NEG_INF:.*]] = xla_hlo.constant dense<0xFF800000> : tensor - // CHECK: %[[MAX:.*]] = "xla_hlo.reduce"(%[[CASTED_INP]], %[[NEG_INF]]) - // CHECK: xla_hlo.maximum - // CHECK: "xla_hlo.return" - // CHECK: {dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf32>, tensor) -> tensor<2xf32> - // CHECK: %[[CASTED_MAX:.*]] = "xla_hlo.convert"(%[[MAX]]) : (tensor<2xf32>) -> tensor<2xf32> - - // CHECK: %[[SHIFTED_INP:.*]] = "xla_hlo.subtract"(%[[ARG0]], %[[CASTED_MAX]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK: %[[EXP:.*]] = "xla_hlo.exponential"(%[[SHIFTED_INP]]) - - // Verify reduce op for summation and its body. 
- // CHECK-DAG: %[[CASTED_EXP:.*]] = "xla_hlo.convert"(%[[EXP]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor - // CHECK: %[[SUM:.*]] = "xla_hlo.reduce"(%[[CASTED_EXP]], %[[ZERO]]) - // CHECK: xla_hlo.add - // CHECK: "xla_hlo.return" - // CHECK: {dimensions = dense<1> : tensor<1xi64>} + // CHECK: %{{.*}} = "xla_hlo.reduce"({{.*}}) + // CHECK: %[[SUM:.*]] = "xla_hlo.reduce"({{.*}}) // CHECK: %[[CASTED_SUM:.*]] = "xla_hlo.convert"(%[[SUM]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[LOG:.*]] = "xla_hlo.log"(%[[CASTED_SUM]]) : (tensor<2xf32>) -> tensor<2xf32> - - // CHECK: %[[RESULT:.*]] = "xla_hlo.subtract"(%[[SHIFTED_INP]], %[[LOG]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // return %[[RESULT]] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.shape_of %[[ARG0]] + // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) : (!shape.shape) -> tensor<2xindex> + // CHECK: %[[BCAST_SUM:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[LOG]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[RESULT:.*]] = xla_hlo.subtract {{.*}}, %[[BCAST_SUM]] + // CHECK: return %[[RESULT]] %0 = "tf.LogSoftmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> return %0: tensor<2x3xf32> @@ -2163,16 +1809,41 @@ func @neg_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @sigmoid func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-DAG: [[R0:%.+]] = xla_hlo.constant dense<5.000000e-01> : tensor - // CHECK-DAG: [[R1:%.+]] = "xla_hlo.broadcast"([[R0]]) {broadcast_sizes = dense<2> : tensor<1xi64>} : (tensor) -> tensor<2xf32> - // CHECK-DAG: [[R2:%.+]] = xla_hlo.multiply %arg0, [[R1]] : tensor<2xf32> - // CHECK-DAG: [[R3:%.+]] = "xla_hlo.tanh"([[R2]]) : (tensor<2xf32>) -> tensor<2xf32> - // CHECK-DAG: [[R4:%.+]] = xla_hlo.multiply [[R3]], [[R1]] : tensor<2xf32> - // CHECK-DAG: [[R5:%.+]] = xla_hlo.add [[R4]], [[R1]] : tensor<2xf32> + // CHECK-DAG: [[SCALAR:%.+]] = xla_hlo.constant dense<5.000000e-01> : tensor + // CHECK-DAG: [[SHAPE:%.+]] = shape.shape_of %arg0 : tensor<2xf32> + // CHECK-DAG: [[SHAPE_VAL:%.+]] = "shape.to_extent_tensor"([[SHAPE]]) : (!shape.shape) -> tensor<1xindex> + // CHECK-DAG: [[HALF:%.+]] = "xla_hlo.dynamic_broadcast_in_dim"([[SCALAR]], [[SHAPE_VAL]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor<2xf32> + // CHECK-DAG: [[R1:%.+]] = xla_hlo.multiply %arg0, [[HALF]] : tensor<2xf32> + // CHECK-DAG: [[R2:%.+]] = "xla_hlo.tanh"([[R1]]) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK-DAG: [[R3:%.+]] = xla_hlo.multiply [[R2]], [[HALF]] : tensor<2xf32> + // CHECK-DAG: [[R4:%.+]] = xla_hlo.add [[R3]], [[HALF]] : tensor<2xf32> %0 = "tf.Sigmoid"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: @sigmoid_complex +func @sigmoid_complex(%arg0: tensor<2xcomplex>) -> tensor<2xcomplex> { + // CHECK: [[R0:%.+]] = xla_hlo.constant dense<(5.000000e-01,0.000000e+00)> : tensor> + // CHECK-NOT: tf.Sigmoid + %0 = "tf.Sigmoid"(%arg0) : (tensor<2xcomplex>) -> tensor<2xcomplex> + return %0 : tensor<2xcomplex> +} + +// CHECK-LABEL: @sigmoid_unranked +func @sigmoid_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-DAG: [[SCALAR:%.+]] = xla_hlo.constant dense<5.000000e-01> : tensor + // CHECK-DAG: [[SHAPE:%.+]] = shape.shape_of %arg0 : tensor<*xf32> + // CHECK-DAG: [[SHAPE_VAL:%.+]] = "shape.to_extent_tensor"([[SHAPE]]) : (!shape.shape) -> tensor + // CHECK-DAG: [[HALF:%.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"([[SCALAR]], [[SHAPE_VAL]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor<*xf32> + // CHECK-DAG: [[R1:%.+]] = xla_hlo.multiply %arg0, [[HALF]] : tensor<*xf32> + // CHECK-DAG: [[R2:%.+]] = "xla_hlo.tanh"([[R1]]) : (tensor<*xf32>) -> tensor<*xf32> + // CHECK-DAG: [[R3:%.+]] = xla_hlo.multiply [[R2]], [[HALF]] : tensor<*xf32> + // CHECK-DAG: [[R4:%.+]] = xla_hlo.add [[R3]], [[HALF]] : tensor<*xf32> + %0 = "tf.Sigmoid"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + + // CHECK-LABEL: @sigmoid_grad func @sigmoid_grad(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { // CHECK-DAG: [[MUL0:%.+]] = xla_hlo.multiply %arg1, %arg0 : tensor<2xf32> @@ -2184,6 +1855,17 @@ func @sigmoid_grad(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: @sigmoid_grad_complex +func @sigmoid_grad_complex(%arg0: tensor<2xcomplex>, %arg1: tensor<2xcomplex>) -> tensor<2xcomplex> { + // CHECK-DAG: [[MUL0:%.+]] = xla_hlo.multiply %arg1, %arg0 : tensor<2xcomplex> + // CHECK-DAG: [[ONE:%.+]] = xla_hlo.constant dense<(1.000000e+00,0.000000e+00)> : tensor<2xcomplex> + // CHECK-DAG: [[SUB:%.+]] = xla_hlo.subtract [[ONE]], %arg0 : tensor<2xcomplex> + // CHECK-DAG: [[MUL1:%.+]] = xla_hlo.multiply [[MUL0]], [[SUB]] : tensor<2xcomplex> + // CHECK: return [[MUL1]] + %0 = "tf.SigmoidGrad"(%arg0, %arg1) : (tensor<2xcomplex>, tensor<2xcomplex>) -> tensor<2xcomplex> + return %0 : tensor<2xcomplex> +} + // CHECK-LABEL: @sin func @sin(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: "xla_hlo.sine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> @@ -2205,13 +1887,6 @@ func @sin_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } -// CHECK-LABEL: func @round -func @round(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: "xla_hlo.round_nearest_afz"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - %0 = "tf.Round"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - return %0 : tensor<2xf32> -} - // CHECK-LABEL: func @rsqrt func @rsqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: "xla_hlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> @@ -2505,6 +2180,18 @@ func @simple_strided_slice(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { return %output : tensor<3x2xf32> } +// CHECK-LABEL: dynamic_strided_slice +func @dynamic_strided_slice(%input: tensor) -> tensor { + %begin = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %end = "tf.Const"() {value = dense<[3, 7]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %strides = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>} : () -> (tensor<2xi32>) + + // CHECK: "tf.StridedSlice" + %output = "tf.StridedSlice"(%input, %begin, %end, %strides) + : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + return %output : tensor +} + // CHECK-LABEL: strided_slice_negative_indices func @strided_slice_negative_indices(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { %begin = "tf.Const"() {value = dense<[-1, -2]> : tensor<2xi32>} : () -> (tensor<2xi32>) @@ -2524,6 +2211,18 @@ func @strided_slice_negative_indices(%input: tensor<4x8xf32>) -> tensor<3x2xf32> return %output : tensor<3x2xf32> } +// CHECK-LABEL: dynamic_strided_slice_negative_indices +func @dynamic_strided_slice_negative_indices(%input: tensor) -> tensor { + %begin = "tf.Const"() {value = dense<[-1, -2]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %end = "tf.Const"() {value = dense<[-4, -8]> : tensor<2xi32>} : () -> (tensor<2xi32>) + 
%strides = "tf.Const"() {value = dense<[-1, -3]> : tensor<2xi32>} : () -> (tensor<2xi32>) + + // CHECK: tf.StridedSlice + %output = "tf.StridedSlice"(%input, %begin, %end, %strides) + : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + return %output : tensor +} + // CHECK-LABEL: strided_slice_range_clamping func @strided_slice_range_clamping(%input: tensor<4x8xf32>) -> tensor<0x3xf32> { %begin = "tf.Const"() {value = dense<[-4, -10]> : tensor<2xi32>} : () -> (tensor<2xi32>) @@ -2720,10 +2419,10 @@ func @strided_slice_nonconstant_begin_end(%arg0: tensor, %arg1: tensor<32x1 // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64>, // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<1xi32> // CHECK-NEXT: %[[INDEX2:.*]] = "xla_hlo.reshape"(%[[INDEX]]) : (tensor<1xi32>) -> tensor - // CHECK-NEXT: %[[CMP:.*]] = "xla_hlo.compare"(%[[INDEX2]], %[[ZERO]]) + // CHECK-NEXT: %[[CMP:.*]] = xla_chlo.broadcast_compare %[[INDEX2]], %[[ZERO]] // CHECK-DAG-SAME: {comparison_direction = "LT"} : (tensor, tensor) -> tensor // CHECK-NEXT: %[[DIM:.*]] = xla_hlo.constant dense<32> : tensor - // CHECK-NEXT: %[[WRAP:.*]] = xla_hlo.add %[[DIM]], %[[INDEX2]] : tensor + // CHECK-NEXT: %[[WRAP:.*]] = xla_chlo.broadcast_add %[[DIM]], %[[INDEX2]] : (tensor, tensor) -> tensor // CHECK-NEXT: %[[INDEX3:.*]] = "xla_hlo.select"(%[[CMP]], %[[WRAP]], %[[INDEX2]]) : // CHECK-DAG-SAME: (tensor, tensor, tensor) -> tensor // CHECK-NEXT: %[[SLICED:.*]] = "xla_hlo.dynamic-slice" @@ -2852,7 +2551,7 @@ func @mean(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { // CHECK: "xla_hlo.return"(%[[REDUCE_BODY_RESULT]]) : (tensor) -> () // CHECK: }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor) -> tensor<4xf32> // CHECK: %[[DIVISOR:.*]] = xla_hlo.constant dense<8.000000e+00> : tensor - // CHECK: %[[MEAN:.*]] = "xla_hlo.divide"(%[[REDUCED]], %[[DIVISOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: %[[MEAN:.*]] = xla_chlo.broadcast_divide %[[REDUCED]], %[[DIVISOR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: %[[CAST_BACK:.*]] = "xla_hlo.convert"(%[[MEAN]]) : (tensor<4xf32>) -> tensor<4xf16> // CHECK: %[[RESULT:.*]] = "xla_hlo.reshape"(%[[CAST_BACK]]) : (tensor<4xf16>) -> tensor<4x1xf16> // CHECK: return %[[RESULT]] : tensor<4x1xf16> @@ -3156,8 +2855,8 @@ func @rng_std_normal(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { func @range(%arg0: tensor, %arg1: tensor) -> tensor<5xf32> { %1 = "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "range/limit", value = dense<5.000000e+00> : tensor} : () -> tensor // CHECK-DAG: [[IOTA:%.*]] = "xla_hlo.iota" - // CHECK-DAG: [[MUL:%.*]] = "xla_hlo.multiply"([[IOTA]], [[DELTA]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} - // CHECK: "xla_hlo.add"([[MUL]], [[START]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = xla_chlo.broadcast_multiply [[IOTA]], [[DELTA]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK: xla_chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} %3 = "tf.Range"(%arg0, %1, %arg1) {Tidx = "tfdtype$DT_FLOAT", device = "", name = "range"} : (tensor, tensor, tensor) -> tensor<5xf32> return %3 : tensor<5xf32> } @@ -3169,12 +2868,12 @@ func @linspace_static(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { // CHECK-DAG: [[NUM_CAST:%.*]] = tensor_cast [[NUM]] // CHECK-DAG: [[NUM_F32:%.*]] = 
"xla_hlo.convert"([[NUM_CAST]]) // CHECK-DAG: [[ONE:%.*]] = xla_hlo.constant dense<1.000000e+00> - // CHECK-DAG: [[STEP_DENOMINATOR:%.*]] = xla_hlo.subtract [[NUM_F32]], [[ONE]] - // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = xla_hlo.subtract [[STOP]], [[START]] - // CHECK-DAG: [[STEP:%.*]] = xla_hlo.divide [[STEP_NUMERATOR]], [[STEP_DENOMINATOR]] + // CHECK-DAG: [[STEP_DENOMINATOR:%.*]] = xla_chlo.broadcast_subtract [[NUM_F32]], [[ONE]] + // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = xla_chlo.broadcast_subtract [[STOP]], [[START]] + // CHECK-DAG: [[STEP:%.*]] = xla_chlo.broadcast_divide [[STEP_NUMERATOR]], [[STEP_DENOMINATOR]] // CHECK-DAG: [[IOTA:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} - // CHECK-DAG: [[MUL:%.*]] = "xla_hlo.multiply"([[IOTA]], [[STEP]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} - // CHECK-DAG: [[LINSPACE:%.*]] = "xla_hlo.add"([[MUL]], [[START]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = xla_chlo.broadcast_multiply [[IOTA]], [[STEP]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} + // CHECK-DAG: [[LINSPACE:%.*]] = xla_chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} // CHECK: return [[LINSPACE]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<4> : tensor} : () -> tensor %1 = "tf.LinSpace"(%arg0, %arg1, %0) : (tensor, tensor, tensor) -> tensor<4xf32> @@ -3469,13 +3168,13 @@ func @size_ranked(%input: tensor<2x?x8xf32>) -> (tensor) { // CHECK: %[[CONST:.*]] = xla_hlo.constant dense<1> // CHECK: %[[DIM_0:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 0 - // CHECK: %[[MUL_0:.*]] = xla_hlo.multiply %[[CONST]], %[[DIM_0]] + // CHECK: %[[MUL_0:.*]] = xla_chlo.broadcast_multiply %[[CONST]], %[[DIM_0]] // CHECK: %[[DIM_1:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 1 - // CHECK: %[[MUL_1:.*]] = xla_hlo.multiply %[[MUL_0]], %[[DIM_1]] + // CHECK: %[[MUL_1:.*]] = xla_chlo.broadcast_multiply %[[MUL_0]], %[[DIM_1]] // CHECK: %[[DIM_2:.*]] = "xla_hlo.get_dimension_size"(%[[INPUT]]) // CHECK-SAME: dimension = 2 - // CHECK: %[[MUL_2:.*]] = xla_hlo.multiply %[[MUL_1]], %[[DIM_2]] + // CHECK: %[[MUL_2:.*]] = xla_chlo.broadcast_multiply %[[MUL_1]], %[[DIM_2]] %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor<2x?x8xf32>) -> tensor // CHECK: return %[[MUL_2]] return %size : tensor @@ -3632,30 +3331,31 @@ func @assert(%arg0: tensor, %arg1: tensor<*xf32>) { // tf.Unpack legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: @unpack -func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : 
tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// TODO(b/156340000): Re-enable when fixed. +// // C-HECK-LABEL: @unpack +// func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// // C-HECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) - // return %[[RES1]], %[[RES2]], %[[RES3]] - return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> -} +// %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) +// // return %[[RES1]], %[[RES2]], %[[RES3]] +// return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> +// } -// CHECK-LABEL: @unpack_dynamic -func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor +// // C-HECK-LABEL: @unpack_dynamic +// func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor - %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) - return %0#0, %0#1 : tensor, tensor -} +// %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) +// return %0#0, %0#1 : tensor, tensor +// } //===----------------------------------------------------------------------===// // tf.UnsortedSegment{Max|Min|Prod|Sum} legalization @@ -3720,11 
+3420,11 @@ func @unsorted_segment_max(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %arg1: tensor<16x5xi32>) -> tensor<16x2x5x3xf32> { - // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5x3xf32> +func @gather_v2(%arg0: tensor<16x2x3xf32>, %arg1: tensor<16x5xi32>) -> tensor<16x2x5xf32> { + // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5xf32> %0 = "tf.Const"() { value = dense<[-1]> : tensor<1xi32> } : () -> tensor<1xi32> - %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5x3xf32> - return %1 : tensor<16x2x5x3xf32> + %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5xf32> + return %1 : tensor<16x2x5xf32> } // CHECK-LABEL: @gather_v2_dynamic @@ -3991,7 +3691,7 @@ func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK: [[INDICES1:%.*]] = "xla_hlo.dynamic-update-slice"([[INDICES]], [[TGT_IDX]], [[IV]]) : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[INDICES2:%.*]] = "xla_hlo.dynamic-update-slice"([[INDICES1]], [[SRC_IDX]], [[SWP]]) : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[ONE:%.*]] = xla_hlo.constant dense<1> : tensor - // CHECK: [[NEW_IV:%.*]] = xla_hlo.add [[IV]], [[ONE]] + // CHECK: [[NEW_IV:%.*]] = xla_chlo.broadcast_add [[IV]], [[ONE]] // CHECK: [[NEW_TUPLE:%.*]] = "xla_hlo.tuple"([[NEW_IV]], [[SWAPS]], [[INDICES2]]) // CHECK: "xla_hlo.return"([[NEW_TUPLE]]) // CHECK: }) : (tuple, tensor<4xi32>, tensor<4xi32>>) -> tuple, tensor<4xi32>, tensor<4xi32>> @@ -4060,7 +3760,7 @@ func @avgpool_valid_padding(%arg0: tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> // CHECK: "xla_hlo.return"([[ADD]]) // CHECK: }) {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>} : (tensor<2x12x20x7xf32>, tensor) -> tensor<2x3x5x7xf32> // CHECK: [[COUNT:%.+]] = xla_hlo.constant dense<4.000000e+00> : tensor - // CHECK: [[DIV:%.+]] = "xla_hlo.divide"([[REDUCE]], [[COUNT]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<2x3x5x7xf32>, tensor) -> tensor<2x3x5x7xf32> + // CHECK: [[DIV:%.+]] = xla_chlo.broadcast_divide [[REDUCE]], [[COUNT]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x3x5x7xf32>, tensor) -> tensor<2x3x5x7xf32> // CHECK: [[CONV16:%.+]] = "xla_hlo.convert"([[DIV]]) : (tensor<2x3x5x7xf32>) -> tensor<2x3x5x7xf16> // CHECK: return [[CONV16]] %0 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xf16>) -> tensor<2x3x5x7xf16> @@ -4081,6 +3781,41 @@ func @xla_sharding(%arg0: tensor<4x16xf32>) -> tensor<4x16xf32> { return %0 : tensor<4x16xf32> } +// CHECK-LABEL: inplace_update_one +func @inplace_update_one(%arg0: tensor<8x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor<1xi32>) -> tensor<8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : 
tensor<2xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[UPDATE:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE2]], [[RESHAPE1]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x4xf32>, tensor<1xi32>, tensor<1x4xf32>) -> tensor<8x4xf32> + + // CHECK: return [[UPDATE]] + return %0 : tensor<8x4xf32> +} + +// CHECK-LABEL: inplace_update_three +func @inplace_update_three(%arg0: tensor<8x8x4xf32>, %arg1: tensor<3x8x4xf32>, %arg2: tensor<3xi32>) -> tensor<8x8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE3:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE4:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 8, 4]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE5:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[2, 8, 4]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE6:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[3, 8, 4]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[RESHAPE2:%.+]] = "xla_hlo.reshape"([[SLICE2]]) + // CHECK-DAG: [[RESHAPE3:%.+]] = "xla_hlo.reshape"([[SLICE3]]) + // CHECK-DAG: [[UPDATE1:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE4]], [[RESHAPE1]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE2:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE1]], [[SLICE5]], [[RESHAPE2]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE3:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE2]], [[SLICE6]], [[RESHAPE3]], [[CST]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x8x4xf32>, tensor<3xi32>, tensor<3x8x4xf32>) -> tensor<8x8x4xf32> + + // CHECK: return [[UPDATE3]] : tensor<8x8x4xf32> + return %0 : tensor<8x8x4xf32> +} + + // CHECK-LABEL: xla_dynamic_update_slice func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2xi32>) -> tensor<4x16xf32> { // CHECK: [[SLICE0:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> @@ -4103,6 +3838,21 @@ func @xla_dynamic_update_slice2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg return %0 : tensor<4xf32> } +//===----------------------------------------------------------------------===// +// AllToAll op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @alltoall_basic +func @alltoall_basic(%input: tensor<10xf32>) -> tensor<10xf32> { + %group_assignment = "tf.Const" () { + value = dense<[[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi32> + } : () -> tensor<3x4xi32> + %result = "tf.AllToAll"(%input, %group_assignment) {T = f32, concat_dimension = 1 : i64, split_count = 2 : i64, split_dimension = 0 : i64} : (tensor<10xf32>, tensor<3x4xi32>) -> tensor<10xf32> + // CHECK: xla_hlo.all_to_all + // CHECK-SAME: replica_groups = dense<{{\[}}[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi64> + return %result : tensor<10xf32> +} + //===----------------------------------------------------------------------===// // Cumsum op legalizations. //===----------------------------------------------------------------------===// @@ -4150,177 +3900,11 @@ func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor // CHECK: func @qr([[VAL_0:%.*]]: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) func @qr(%arg0: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) { -// CHECK: [[VAL_1:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<100x100xi32> -// CHECK: [[VAL_2:%.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<100x100xi32> -// CHECK: [[VAL_3:%.*]] = "xla_hlo.compare"([[VAL_1]], [[VAL_2]]) {comparison_direction = "EQ"} : (tensor<100x100xi32>, tensor<100x100xi32>) -> tensor<100x100xi1> -// CHECK: [[VAL_4:%.*]] = "xla_hlo.convert"([[VAL_3]]) : (tensor<100x100xi1>) -> tensor<100x100xf32> -// CHECK: [[VAL_5:%.*]] = "xla_hlo.broadcast"([[VAL_4]]) {broadcast_sizes = dense<500> : tensor<1xi64>} : (tensor<100x100xf32>) -> tensor<500x100x100xf32> -// CHECK: [[VAL_6:%.*]] = "xla_hlo.slice"([[VAL_0]]) {limit_indices = dense<[500, 100, 75]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_7:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_8:%.*]] = "xla_hlo.broadcast"([[VAL_7]]) {broadcast_sizes = dense<[500, 100, 75]> : tensor<3xi64>} : (tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_9:%.*]] = "xla_hlo.broadcast"([[VAL_7]]) {broadcast_sizes = dense<[500, 75]> : tensor<2xi64>} : (tensor) -> tensor<500x75xf32> -// CHECK: [[VAL_10:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_11:%.*]] = "xla_hlo.tuple"([[VAL_10]], [[VAL_6]], [[VAL_8]], [[VAL_9]]) : (tensor, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>) -> tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: [[VAL_12:%.*]] = "xla_hlo.while"([[VAL_11]]) ( { -// CHECK: ^bb0([[VAL_13:%.*]]: tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>): -// CHECK: [[VAL_14:%.*]] = "xla_hlo.get_tuple_element"([[VAL_13]]) {index = 0 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor -// CHECK: [[VAL_15:%.*]] = xla_hlo.constant dense<75> : tensor -// CHECK: [[VAL_16:%.*]] = "xla_hlo.compare"([[VAL_14]], [[VAL_15]]) {comparison_direction = "LT"} : (tensor, tensor) -> tensor -// CHECK: "xla_hlo.return"([[VAL_16]]) : (tensor) -> () -// CHECK: }, { -// CHECK: ^bb0([[VAL_17:%.*]]: tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>): -// CHECK: [[VAL_18:%.*]] = "xla_hlo.get_tuple_element"([[VAL_17]]) {index = 0 : i32} : (tuple, 
tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor -// CHECK: [[VAL_19:%.*]] = "xla_hlo.get_tuple_element"([[VAL_17]]) {index = 1 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_20:%.*]] = "xla_hlo.get_tuple_element"([[VAL_17]]) {index = 2 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_21:%.*]] = "xla_hlo.get_tuple_element"([[VAL_17]]) {index = 3 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x75xf32> -// CHECK: [[VAL_22:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_23:%.*]] = "xla_hlo.dynamic-slice"([[VAL_19]], [[VAL_22]], [[VAL_22]], [[VAL_18]]) {slice_sizes = dense<[500, 100, 1]> : tensor<3xi64>} : (tensor<500x100x75xf32>, tensor, tensor, tensor) -> tensor<500x100x1xf32> -// CHECK: [[VAL_24:%.*]] = "xla_hlo.reshape"([[VAL_23]]) : (tensor<500x100x1xf32>) -> tensor<500x100xf32> -// CHECK: [[VAL_25:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_26:%.*]] = xla_hlo.constant dense<1.000000e+00> : tensor -// CHECK: [[VAL_27:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_28:%.*]] = "xla_hlo.dynamic-slice"([[VAL_24]], [[VAL_27]], [[VAL_18]]) {slice_sizes = dense<[500, 1]> : tensor<2xi64>} : (tensor<500x100xf32>, tensor, tensor) -> tensor<500x1xf32> -// CHECK: [[VAL_29:%.*]] = "xla_hlo.reshape"([[VAL_28]]) : (tensor<500x1xf32>) -> tensor<500xf32> -// CHECK: [[VAL_30:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<100xi32> -// CHECK: [[VAL_31:%.*]] = "xla_hlo.compare"([[VAL_30]], [[VAL_18]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor<100xi32>, tensor) -> tensor<100xi1> -// CHECK: [[VAL_32:%.*]] = "xla_hlo.convert"([[VAL_31]]) : (tensor<100xi1>) -> tensor<100xf32> -// CHECK: [[VAL_33:%.*]] = "xla_hlo.multiply"([[VAL_24]], [[VAL_32]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<500x100xf32>, tensor<100xf32>) -> tensor<500x100xf32> -// CHECK: [[VAL_34:%.*]] = xla_hlo.multiply [[VAL_33]], [[VAL_33]] : tensor<500x100xf32> -// CHECK: [[VAL_35:%.*]] = "xla_hlo.reduce"([[VAL_34]], [[VAL_25]]) ( { -// CHECK: ^bb0([[VAL_36:%.*]]: tensor, [[VAL_37:%.*]]: tensor): -// CHECK: [[VAL_38:%.*]] = xla_hlo.add [[VAL_36]], [[VAL_37]] : tensor -// CHECK: "xla_hlo.return"([[VAL_38]]) : (tensor) -> () -// CHECK: }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<500x100xf32>, tensor) -> tensor<500xf32> -// CHECK: [[VAL_39:%.*]] = xla_hlo.multiply [[VAL_29]], [[VAL_29]] : tensor<500xf32> -// CHECK: [[VAL_40:%.*]] = xla_hlo.add [[VAL_39]], [[VAL_41:%.*]] : tensor<500xf32> -// CHECK: [[VAL_42:%.*]] = "xla_hlo.sqrt"([[VAL_40]]) : (tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_43:%.*]] = "xla_hlo.compare"([[VAL_41]], [[VAL_25]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<500xf32>, tensor) -> tensor<500xi1> -// CHECK: [[VAL_44:%.*]] = "xla_hlo.compare"([[VAL_29]], [[VAL_25]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} : (tensor<500xf32>, tensor) -> tensor<500xi1> -// CHECK: [[VAL_45:%.*]] = "xla_hlo.broadcast"([[VAL_26]]) {broadcast_sizes = dense<500> : tensor<1xi64>} : (tensor) -> tensor<500xf32> -// CHECK: [[VAL_46:%.*]] = "xla_hlo.negate"([[VAL_45]]) : (tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_47:%.*]] = "xla_hlo.select"([[VAL_44]], [[VAL_45]], 
[[VAL_46]]) : (tensor<500xi1>, tensor<500xf32>, tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_48:%.*]] = xla_hlo.multiply [[VAL_47]], [[VAL_42]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<500xf32> -// CHECK: [[VAL_49:%.*]] = "xla_hlo.select"([[VAL_43]], [[VAL_29]], [[VAL_48]]) : (tensor<500xi1>, tensor<500xf32>, tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_50:%.*]] = xla_hlo.subtract [[VAL_49]], [[VAL_29]] : tensor<500xf32> -// CHECK: [[VAL_51:%.*]] = xla_hlo.divide [[VAL_50]], [[VAL_49]] : tensor<500xf32> -// CHECK: [[VAL_52:%.*]] = "xla_hlo.broadcast"([[VAL_25]]) {broadcast_sizes = dense<500> : tensor<1xi64>} : (tensor) -> tensor<500xf32> -// CHECK: [[VAL_53:%.*]] = "xla_hlo.select"([[VAL_43]], [[VAL_52]], [[VAL_51]]) : (tensor<500xi1>, tensor<500xf32>, tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_54:%.*]] = xla_hlo.subtract [[VAL_29]], [[VAL_49]] : tensor<500xf32> -// CHECK: [[VAL_55:%.*]] = "xla_hlo.select"([[VAL_43]], [[VAL_45]], [[VAL_54]]) : (tensor<500xi1>, tensor<500xf32>, tensor<500xf32>) -> tensor<500xf32> -// CHECK: [[VAL_56:%.*]] = "xla_hlo.compare"([[VAL_30]], [[VAL_18]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<100xi32>, tensor) -> tensor<100xi1> -// CHECK: [[VAL_57:%.*]] = "xla_hlo.convert"([[VAL_56]]) : (tensor<100xi1>) -> tensor<100xf32> -// CHECK: [[VAL_58:%.*]] = "xla_hlo.broadcast"([[VAL_57]]) {broadcast_sizes = dense<1> : tensor<1xi64>} : (tensor<100xf32>) -> tensor<1x100xf32> -// CHECK: [[VAL_59:%.*]] = "xla_hlo.divide"([[VAL_33]], [[VAL_55]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<500x100xf32>, tensor<500xf32>) -> tensor<500x100xf32> -// CHECK: [[VAL_60:%.*]] = "xla_hlo.add"([[VAL_58]], [[VAL_59]]) : (tensor<1x100xf32>, tensor<500x100xf32>) -> tensor<500x100xf32> -// CHECK: [[VAL_61:%.*]] = "xla_hlo.reshape"([[VAL_60]]) : (tensor<500x100xf32>) -> tensor<500x1x100xf32> -// CHECK: [[VAL_62:%.*]] = "xla_hlo.dot_general"([[VAL_61]], [[VAL_19]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x1x100xf32>, tensor<500x100x75xf32>) -> tensor<500x1x75xf32> -// CHECK: [[VAL_63:%.*]] = "xla_hlo.dot_general"([[VAL_61]], [[VAL_62]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<1> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x1x100xf32>, tensor<500x1x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_64:%.*]] = "xla_hlo.multiply"([[VAL_53]], [[VAL_63]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<500xf32>, tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_65:%.*]] = xla_hlo.subtract [[VAL_19]], [[VAL_64]] : tensor<500x100x75xf32> -// CHECK: [[VAL_66:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<100x1xi32> -// CHECK: [[VAL_67:%.*]] = "xla_hlo.compare"([[VAL_66]], [[VAL_18]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} : (tensor<100x1xi32>, tensor) -> tensor<100x1xi1> -// CHECK: [[VAL_68:%.*]] = "xla_hlo.convert"([[VAL_67]]) : (tensor<100x1xi1>) -> tensor<100x1xf32> -// CHECK: [[VAL_69:%.*]] = "xla_hlo.compare"([[VAL_66]], [[VAL_18]]) 
{broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<100x1xi32>, tensor) -> tensor<100x1xi1> -// CHECK: [[VAL_70:%.*]] = "xla_hlo.convert"([[VAL_69]]) : (tensor<100x1xi1>) -> tensor<100x1xf32> -// CHECK: [[VAL_71:%.*]] = "xla_hlo.broadcast"([[VAL_70]]) {broadcast_sizes = dense<1> : tensor<1xi64>} : (tensor<100x1xf32>) -> tensor<1x100x1xf32> -// CHECK: [[VAL_72:%.*]] = "xla_hlo.multiply"([[VAL_23]], [[VAL_68]]) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<500x100x1xf32>, tensor<100x1xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_73:%.*]] = "xla_hlo.multiply"([[VAL_49]], [[VAL_71]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<500xf32>, tensor<1x100x1xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_74:%.*]] = xla_hlo.add [[VAL_72]], [[VAL_73]] : tensor<500x100x1xf32> -// CHECK: [[VAL_75:%.*]] = "xla_hlo.broadcast_in_dim"([[VAL_74]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<500x100x1xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_76:%.*]] = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<500x100x75xi32> -// CHECK: [[VAL_77:%.*]] = "xla_hlo.compare"([[VAL_76]], [[VAL_18]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<500x100x75xi32>, tensor) -> tensor<500x100x75xi1> -// CHECK: [[VAL_78:%.*]] = "xla_hlo.select"([[VAL_77]], [[VAL_75]], [[VAL_65]]) : (tensor<500x100x75xi1>, tensor<500x100x75xf32>, tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_79:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_80:%.*]] = "xla_hlo.broadcast"([[VAL_79]]) {broadcast_sizes = dense<[500, 100, 75]> : tensor<3xi64>} : (tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_81:%.*]] = "xla_hlo.add"([[VAL_80]], [[VAL_60]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<500x100x75xf32>, tensor<500x100xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_82:%.*]] = "xla_hlo.select"([[VAL_77]], [[VAL_81]], [[VAL_80]]) : (tensor<500x100x75xi1>, tensor<500x100x75xf32>, tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_83:%.*]] = xla_hlo.add [[VAL_20]], [[VAL_82]] : tensor<500x100x75xf32> -// CHECK: [[VAL_84:%.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<500x75xi32> -// CHECK: [[VAL_85:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_86:%.*]] = "xla_hlo.broadcast"([[VAL_85]]) {broadcast_sizes = dense<[500, 75]> : tensor<2xi64>} : (tensor) -> tensor<500x75xf32> -// CHECK: [[VAL_87:%.*]] = "xla_hlo.compare"([[VAL_84]], [[VAL_18]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<500x75xi32>, tensor) -> tensor<500x75xi1> -// CHECK: [[VAL_88:%.*]] = "xla_hlo.add"([[VAL_86]], [[VAL_53]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<500x75xf32>, tensor<500xf32>) -> tensor<500x75xf32> -// CHECK: [[VAL_89:%.*]] = "xla_hlo.select"([[VAL_87]], [[VAL_88]], [[VAL_86]]) : (tensor<500x75xi1>, tensor<500x75xf32>, tensor<500x75xf32>) -> tensor<500x75xf32> -// CHECK: [[VAL_90:%.*]] = xla_hlo.add [[VAL_21]], [[VAL_89]] : tensor<500x75xf32> -// CHECK: [[VAL_91:%.*]] = xla_hlo.constant dense<1> : tensor -// CHECK: [[VAL_92:%.*]] = xla_hlo.add [[VAL_18]], [[VAL_91]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor -// CHECK: [[VAL_93:%.*]] = "xla_hlo.tuple"([[VAL_92]], [[VAL_78]], [[VAL_83]], [[VAL_90]]) : (tensor, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>) -> tuple, tensor<500x100x75xf32>, 
tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: "xla_hlo.return"([[VAL_93]]) : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> () -// CHECK: }) : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: [[VAL_94:%.*]] = "xla_hlo.get_tuple_element"([[VAL_95:%.*]]) {index = 1 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_96:%.*]] = "xla_hlo.get_tuple_element"([[VAL_95]]) {index = 2 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_97:%.*]] = "xla_hlo.get_tuple_element"([[VAL_95]]) {index = 3 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x75xf32> -// CHECK: [[VAL_98:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_99:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_100:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_101:%.*]] = "xla_hlo.dynamic-update-slice"([[VAL_0]], [[VAL_94]], [[VAL_100]], [[VAL_98]], [[VAL_99]]) : (tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor, tensor, tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_102:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_103:%.*]] = "xla_hlo.broadcast"([[VAL_102]]) {broadcast_sizes = dense<[500, 100, 75]> : tensor<3xi64>} : (tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_104:%.*]] = "xla_hlo.slice"([[VAL_96]]) {limit_indices = dense<[500, 100, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x75xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_105:%.*]] = "xla_hlo.slice"([[VAL_97]]) {limit_indices = dense<[500, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<500x75xf32>) -> tensor<500x1xf32> -// CHECK: [[VAL_106:%.*]] = "xla_hlo.negate"([[VAL_105]]) : (tensor<500x1xf32>) -> tensor<500x1xf32> -// CHECK: [[VAL_107:%.*]] = "xla_hlo.multiply"([[VAL_106]], [[VAL_104]]) {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<500x1xf32>, tensor<500x100x1xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_108:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_109:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_110:%.*]] = "xla_hlo.dynamic-update-slice"([[VAL_103]], [[VAL_107]], [[VAL_109]], [[VAL_109]], [[VAL_108]]) : (tensor<500x100x75xf32>, tensor<500x100x1xf32>, tensor, tensor, tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_111:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_112:%.*]] = "xla_hlo.tuple"([[VAL_111]], [[VAL_110]], [[VAL_96]], [[VAL_97]]) : (tensor, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>) -> tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: [[VAL_113:%.*]] = "xla_hlo.while"([[VAL_112]]) ( { -// CHECK: ^bb0([[VAL_114:%.*]]: tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>): -// CHECK: [[VAL_115:%.*]] = "xla_hlo.get_tuple_element"([[VAL_114]]) {index = 0 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor -// CHECK: [[VAL_116:%.*]] = xla_hlo.constant dense<74> : tensor -// CHECK: [[VAL_117:%.*]] = "xla_hlo.compare"([[VAL_115]], [[VAL_116]]) {comparison_direction = "LT"} : (tensor, tensor) -> tensor -// CHECK: 
"xla_hlo.return"([[VAL_117]]) : (tensor) -> () -// CHECK: }, { -// CHECK: ^bb0([[VAL_118:%.*]]: tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>): -// CHECK: [[VAL_119:%.*]] = "xla_hlo.get_tuple_element"([[VAL_118]]) {index = 0 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor -// CHECK: [[VAL_120:%.*]] = "xla_hlo.get_tuple_element"([[VAL_118]]) {index = 1 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_121:%.*]] = "xla_hlo.get_tuple_element"([[VAL_118]]) {index = 2 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_122:%.*]] = "xla_hlo.get_tuple_element"([[VAL_118]]) {index = 3 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x75xf32> -// CHECK: [[VAL_123:%.*]] = xla_hlo.constant dense<1> : tensor -// CHECK: [[VAL_124:%.*]] = xla_hlo.add [[VAL_119]], [[VAL_123]] : tensor -// CHECK: [[VAL_125:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_126:%.*]] = "xla_hlo.dynamic-slice"([[VAL_121]], [[VAL_125]], [[VAL_125]], [[VAL_124]]) {slice_sizes = dense<[500, 100, 1]> : tensor<3xi64>} : (tensor<500x100x75xf32>, tensor, tensor, tensor) -> tensor<500x100x1xf32> -// CHECK: [[VAL_127:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_128:%.*]] = "xla_hlo.dynamic-slice"([[VAL_122]], [[VAL_127]], [[VAL_124]]) {slice_sizes = dense<[500, 1]> : tensor<2xi64>} : (tensor<500x75xf32>, tensor, tensor) -> tensor<500x1xf32> -// CHECK: [[VAL_129:%.*]] = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<500x100x75xi32> -// CHECK: [[VAL_130:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor -// CHECK: [[VAL_131:%.*]] = "xla_hlo.broadcast"([[VAL_130]]) {broadcast_sizes = dense<[500, 100, 75]> : tensor<3xi64>} : (tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_132:%.*]] = "xla_hlo.compare"([[VAL_129]], [[VAL_124]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<500x100x75xi32>, tensor) -> tensor<500x100x75xi1> -// CHECK: [[VAL_133:%.*]] = "xla_hlo.select"([[VAL_132]], [[VAL_131]], [[VAL_121]]) : (tensor<500x100x75xi1>, tensor<500x100x75xf32>, tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_134:%.*]] = "xla_hlo.dot_general"([[VAL_133]], [[VAL_126]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<1> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x75xf32>, tensor<500x100x1xf32>) -> tensor<500x75x1xf32> -// CHECK: [[VAL_135:%.*]] = "xla_hlo.dot_general"([[VAL_120]], [[VAL_134]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x75xf32>, tensor<500x75x1xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_136:%.*]] = "xla_hlo.negate"([[VAL_128]]) : (tensor<500x1xf32>) -> tensor<500x1xf32> -// CHECK: [[VAL_137:%.*]] = xla_hlo.add [[VAL_126]], [[VAL_135]] : tensor<500x100x1xf32> -// CHECK: [[VAL_138:%.*]] = "xla_hlo.multiply"([[VAL_136]], [[VAL_137]]) {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>} : 
(tensor<500x1xf32>, tensor<500x100x1xf32>) -> tensor<500x100x1xf32> -// CHECK: [[VAL_139:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_140:%.*]] = "xla_hlo.dynamic-update-slice"([[VAL_120]], [[VAL_138]], [[VAL_139]], [[VAL_139]], [[VAL_124]]) : (tensor<500x100x75xf32>, tensor<500x100x1xf32>, tensor, tensor, tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_141:%.*]] = xla_hlo.constant dense<1> : tensor -// CHECK: [[VAL_142:%.*]] = xla_hlo.add [[VAL_119]], [[VAL_141]] {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor -// CHECK: [[VAL_143:%.*]] = "xla_hlo.tuple"([[VAL_142]], [[VAL_140]], [[VAL_121]], [[VAL_122]]) : (tensor, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>) -> tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: "xla_hlo.return"([[VAL_143]]) : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> () -// CHECK: }) : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>> -// CHECK: [[VAL_144:%.*]] = "xla_hlo.get_tuple_element"([[VAL_145:%.*]]) {index = 1 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_146:%.*]] = "xla_hlo.get_tuple_element"([[VAL_145]]) {index = 2 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_147:%.*]] = "xla_hlo.get_tuple_element"([[VAL_145]]) {index = 3 : i32} : (tuple, tensor<500x100x75xf32>, tensor<500x100x75xf32>, tensor<500x75xf32>>) -> tensor<500x75xf32> -// CHECK: [[VAL_148:%.*]] = "xla_hlo.slice"([[VAL_101]]) {limit_indices = dense<[500, 100, 75]> : tensor<3xi64>, start_indices = dense<[0, 0, 75]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x75xf32>) -> tensor<500x100x0xf32> -// CHECK: [[VAL_149:%.*]] = "xla_hlo.dot_general"([[VAL_144]], [[VAL_148]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<1> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x75xf32>, tensor<500x100x0xf32>) -> tensor<500x75x0xf32> -// CHECK: [[VAL_150:%.*]] = "xla_hlo.dot_general"([[VAL_96]], [[VAL_149]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x75xf32>, tensor<500x75x0xf32>) -> tensor<500x100x0xf32> -// CHECK: [[VAL_151:%.*]] = xla_hlo.add [[VAL_148]], [[VAL_150]] : tensor<500x100x0xf32> -// CHECK: [[VAL_152:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_153:%.*]] = xla_hlo.constant dense<75> : tensor -// CHECK: [[VAL_154:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_155:%.*]] = "xla_hlo.dynamic-update-slice"([[VAL_101]], [[VAL_151]], [[VAL_154]], [[VAL_152]], [[VAL_153]]) : (tensor<500x100x75xf32>, tensor<500x100x0xf32>, tensor, tensor, tensor) -> tensor<500x100x75xf32> -// CHECK: [[VAL_156:%.*]] = "xla_hlo.slice"([[VAL_5]]) {limit_indices = dense<[500, 100, 100]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x100xf32>) -> tensor<500x100x100xf32> 
-// CHECK: [[VAL_157:%.*]] = "xla_hlo.dot_general"([[VAL_156]], [[VAL_144]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x100xf32>, tensor<500x100x75xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_158:%.*]] = "xla_hlo.dot_general"([[VAL_157]], [[VAL_96]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<2> : tensor<1xi64>}, precision_config = ["HIGHEST", "HIGHEST"]} : (tensor<500x100x75xf32>, tensor<500x100x75xf32>) -> tensor<500x100x100xf32> -// CHECK: [[VAL_159:%.*]] = xla_hlo.add [[VAL_156]], [[VAL_158]] : tensor<500x100x100xf32> -// CHECK: [[VAL_160:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_161:%.*]] = xla_hlo.constant dense<0> : tensor -// CHECK: [[VAL_162:%.*]] = "xla_hlo.dynamic-update-slice"([[VAL_5]], [[VAL_159]], [[VAL_161]], [[VAL_161]], [[VAL_160]]) : (tensor<500x100x100xf32>, tensor<500x100x100xf32>, tensor, tensor, tensor) -> tensor<500x100x100xf32> -// CHECK: [[VAL_163:%.*]] = "xla_hlo.slice"([[VAL_162]]) {limit_indices = dense<[500, 100, 75]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x100xf32>) -> tensor<500x100x75xf32> -// CHECK: [[VAL_164:%.*]] = "xla_hlo.slice"([[VAL_155]]) {limit_indices = dense<[500, 75, 75]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<500x100x75xf32>) -> tensor<500x75x75xf32> -// CHECK: return [[VAL_163]], [[VAL_164]] : tensor<500x100x75xf32>, tensor<500x75x75xf32> + // The tf.Qr lowering is a full algorithm that is not practical to verify with + // FileCheck. Just verify that it was converted. + // TODO(laurenzo): Move this out of the mainline tf2xla conversion as it is + // really only applicable to certain legacy uses. + // CHECK-NOT: "tf.Qr" %0:2 = "tf.Qr"(%arg0) {full_matrices = false} : (tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) return %0#0, %0#1 : tensor<500x100x75xf32>, tensor<500x75x75xf32> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir index d25a84d0e25..9f27a204baf 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -xla-legalize-to-std %s -o - | FileCheck %s +// RUN: xla-opt -xla-legalize-to-std %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { func @binary_ops_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { @@ -42,40 +42,6 @@ func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32 return %4 : tensor<4xi32> } -// Broadcasting is not currently supported. -// TODO(suderman):Future pass should take all broadcasted binary ops and convert -// them to separate broadcast and binary op.
-// CHECK-LABEL: func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { -func @binary_ops_broadcast(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { - // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "add.3"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) { - name = "add.3", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %1 = "xla_hlo.multiply"(%0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "mul.4"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %1 = "xla_hlo.multiply"(%0, %arg1) { - name = "mul.4", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %2 = "xla_hlo.subtract"(%1, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "sub.5"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %2 = "xla_hlo.subtract"(%1, %arg1) { - name = "sub.5", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %3 = "xla_hlo.divide"(%2, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "div.6"} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %3 = "xla_hlo.divide"(%2, %arg1) { - name = "div.6", broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: %4 = "xla_hlo.remainder"(%3, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - %4 = "xla_hlo.remainder"(%3, %arg1) { - broadcast_dimensions = dense<1> : tensor<1xi64>} : - (tensor<4x4xf32>, tensor<4xf32>) -> tensor<4x4xf32> - - // CHECK-NEXT: return %4 : tensor<4x4xf32> - return %4 : tensor<4x4xf32> -} - // CHECK-LABEL: func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>) { func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg0 : tensor<4xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 013748fea28..99b1766e73c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -24,9 +24,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -36,9 +36,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -46,8 +46,8 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: 
memref<6x6xf32>, // PLOOP-LABEL: func @fusion // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic @@ -94,9 +94,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: linalg.generic // CHECK: subf @@ -107,9 +107,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: linalg.generic // TILED: subf @@ -118,8 +118,8 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP-LABEL: func @fusion_of_three // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: linalg.generic // PLOOP: subf @@ -147,11 +147,11 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // CHECK-LABEL: func @fusion_4d // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -161,9 +161,9 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -171,8 +171,8 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // PLOOP-LABEL: func @fusion_4d // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir index 5b763cde2ed..c640b395f4d 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir @@ -50,19 +50,19 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Parallel loop to initialize the output buffer. 
// CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C112]], [[C112]]) step ([[C1]], [[C1]]) { // CHECK: store [[INIT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // Parallel loop over source buffer to compute scattered values. -// CHECK: loop.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { // Window loop w.r.t. first dim. // CHECK: [[SEL_RES_I:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I_0:%.*]] = [[C0]], [[SEL_J_0:%.*]] = [[C0]], // CHECK-SAME: [[SEL_VAL_0:%.*]] = [[C0_F32]], @@ -71,7 +71,7 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Window loop w.r.t. second dim. // CHECK: [[SEL_RES_J:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I:%.*]] = [[SEL_I_0]], [[SEL_J:%.*]] = [[SEL_J_0]], // CHECK-SAME: [[SEL_VAL:%.*]] = [[SEL_VAL_0]], @@ -102,14 +102,14 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // be applied, current selected ivs (SEL_I, SEL_J) and value (SEL_VAL) are // returned in that case. // CHECK: [[IF_INBOUNDS_RES:%.*]]:4 -// CHECK-SAME: = loop.if [[INBOUNDS_1]] -> (index, index, f32, i1) { +// CHECK-SAME: = scf.if [[INBOUNDS_1]] -> (index, index, f32, i1) { // INBOUNDS-THEN-BODY, i.e. if INBOUNDS == true // CHECK: [[ARG_ELEM:%.*]] = load [[ARG_BUF]]{{\[}}[[ARG_I]], [[ARG_J]]] // CHECK: [[IF_INIT_RES:%.*]]:4 - // CHECK-SAME: = loop.if [[SEL_INIT]] -> (index, index, f32, i1) { + // CHECK-SAME: = scf.if [[SEL_INIT]] -> (index, index, f32, i1) { // INIT-THEN-BODY, i.e. INBOUNDS == true and INIT = true @@ -133,40 +133,40 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Depending on PRED, return ARG ivs & elem or current select ivs and value. - // CHECK: [[IF_PRED_RES:%.*]]:4 = loop.if [[PRED]] - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] + // CHECK: [[IF_PRED_RES:%.*]]:4 = scf.if [[PRED]] + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] // CHECK: } else { - // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] // CHECK: } // INIT-THEN-BODY yield. - // CHECK: loop.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, + // CHECK: scf.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, // CHECK-SAME: [[IF_PRED_RES]]#2, [[IF_PRED_RES]]#3 // INIT-ELSE-BODY, i.e. if INBOUNDS == TRUE and INIT == FALSE, returns ARG // ivs and element without computing Select function. - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], // CHECK-SAME: [[CTRUE]] : index, index, f32, i1 // CHECK: } // INBOUNDS-THEN-BODY yield. - // CHECK: loop.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, + // CHECK: scf.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, // CHECK-SAME: [[IF_INIT_RES]]#3 : index, index, f32, i1 // CHECK: } // INBOUNDS-ELSE-REGION, i.e. if INBOUNDS == FALSE // We are in the pad area, return current iter_args. 
- // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], // CHECK-SAME: [[SEL_INIT]] : index, index, f32, i1 // CHECK: } // Window loop w.r.t. second dim yield. -// CHECK: loop.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, +// CHECK: scf.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, // CHECK-SAME: [[IF_INBOUNDS_RES]]#2, [[IF_INBOUNDS_RES]]#3 // CHECK: } // Window loop w.r.t. first dim yield. -// CHECK: loop.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, +// CHECK: scf.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, // CHECK-SAME: [[SEL_RES_J]]#3 : index, index, f32, i1 // CHECK: } @@ -196,4 +196,4 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // CHECK: atomic_yield [[RES]] : f32 // Parallel loop over source buffer yield -// CHECK: loop.yield +// CHECK: scf.yield diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir index 08ba9f02f3e..aaf65b5a38a 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir @@ -5,15 +5,15 @@ func @min_op(%lhs: memref<4x3x2x1xf32>, %rhs: memref<4x3x2x1xf32>, %result: memref<4x3x2x1xf32>) -> () { // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 4 { - // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 3 { - // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 2 { - // CHECK-NEXT: affine.for %[[L:.*]] = 0 to 1 { - // CHECK-NEXT: %[[LHS:.*]] = load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK-NEXT: %[[RHS:.*]] = load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK-NEXT: %[[MIN_PREDICATE:.*]] = cmpf "olt", %[[LHS]], %[[RHS]] : f32 - // CHECK-NEXT: %[[MIN:.*]] = select %[[MIN_PREDICATE]], %[[LHS]], %[[RHS]] : f32 - // CHECK-NEXT: store %[[MIN]], %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK: return + // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 3 { + // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 2 { + // CHECK-NEXT: affine.for %[[L:.*]] = 0 to 1 { + // CHECK-NEXT: %[[LHS:.*]] = affine.load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK-NEXT: %[[RHS:.*]] = affine.load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK-NEXT: %[[MIN_PREDICATE:.*]] = cmpf "olt", %[[LHS]], %[[RHS]] : f32 + // CHECK-NEXT: %[[MIN:.*]] = select %[[MIN_PREDICATE]], %[[LHS]], %[[RHS]] : f32 + // CHECK-NEXT: affine.store %[[MIN]], %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK: return "xla_lhlo.minimum"(%lhs, %rhs, %result) {name = "min.1"} : (memref<4x3x2x1xf32>, memref<4x3x2x1xf32>, memref<4x3x2x1xf32>) -> () return diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir index 4d878cee6f4..16ffbf241b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir @@ -22,7 +22,7 @@ func @reduce(%arg: memref<100x10xf32>, // CHECK-DAG: %[[LB:.*]] = constant 0 : index // CHECK-DAG: %[[UB:.*]] = constant 10 : index // CHECK-DAG: %[[STEP:.*]] = constant 1 : index -// CHECK: loop.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { // CHECK: %[[LHS:.*]] = linalg.slice %[[ARG2]][%[[IDX]]] : memref<100xf32>, index, memref // CHECK: %[[RHS:.*]] = linalg.slice %[[ARG0]][%[[IDX]], 
%[[IDX1]]] : memref<100x10xf32>, index, index, memref<f32> // CHECK: "xla_lhlo.add"(%[[LHS]], %[[RHS]], %[[LHS]]) : (memref<f32>, memref<f32>, memref<f32>) -> () diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index 0fc30ed4901..626e905695c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -411,6 +411,19 @@ func @convert_f32_to_f32(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: memref<2x2xf32>, %result: memref<2x2xi32>) { + "xla_lhlo.convert"(%input, %result) + : (memref<2x2xf32>, memref<2x2xi32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + // CHECK-LABEL: func @cos func @cos(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { "xla_lhlo.cosine"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () @@ -523,6 +536,48 @@ func @tanh(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { // CHECK-NEXT: %[[RESULT:.*]] = tanh %[[OPERAND_IN]] : f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 +// ----- + +// CHECK-LABEL: func @complex +func @complex(%real: memref<2x2xf32>, + %imag: memref<2x2xf32>, + %cplx: memref<2x2xcomplex<f32>>) { + "xla_lhlo.complex"(%real, %imag, %cplx) + : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xcomplex<f32>>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[RE:.*]]: f32, %[[IM:.*]]: f32, %[[CP:.*]]: complex<f32>): +// CHECK-NEXT: %[[RESULT:.*]] = create_complex %[[RE]], %[[IM]] : complex<f32> +// CHECK-NEXT: linalg.yield %[[RESULT]] : complex<f32> + +// ----- + +// CHECK-LABEL: func @real +func @real(%cplx: memref<2x2xcomplex<f32>>, + %real: memref<2x2xf32>) { + "xla_lhlo.real"(%cplx, %real) + : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex<f32>, %[[REAL_OUT:.*]]: f32): +// CHECK-NEXT: %[[REAL:.*]] = re %[[CPLX_IN:.*]] : complex<f32> +// CHECK-NEXT: linalg.yield %[[REAL]] : f32 + +// ----- + +// CHECK-LABEL: func @imag +func @imag(%cplx: memref<2x2xcomplex<f32>>, + %imag: memref<2x2xf32>) { + "xla_lhlo.imag"(%cplx, %imag) + : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex<f32>, %[[IMAG_OUT:.*]]: f32): +// CHECK-NEXT: %[[IMAG:.*]] = im %[[CPLX_IN:.*]] : complex<f32> +// CHECK-NEXT: linalg.yield %[[IMAG]] : f32 // ----- @@ -581,3 +636,16 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) { + "xla_lhlo.reverse"(%arg0, %arg1) { + dimensions = dense<1> : tensor<1xi64> + } : (memref<2x3xf32>, memref<2x3xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir new file mode 100644 index 00000000000..16aad8f7cf3 --- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir @@ -0,0 +1,65 @@ +// RUN: xla-opt %s --test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: func @static_memref_cast +func @static_memref_cast(%buf : memref<10x1x5xf32>) { + %0 = xla_lhlo.static_memref_cast %buf + : memref<10x1x5xf32> -> memref<10x5xf32, offset: 2, strides: [5, 1]> + return +} +// CHECK: %[[INPUT_MEMREF_BLDR:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE_3D:!.*]] +// CHECK: llvm.insertvalue +// CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE_2D:!.*]] + +// CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE_3D]] +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE_3D]] +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_3:.*]] = llvm.insertvalue %[[C2]], %[[MEMREF_BLDR_2]][2] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[C10:.*]] = llvm.mlir.constant(10 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_4:.*]] = llvm.insertvalue %[[C10]], %[[MEMREF_BLDR_3]][3, 0] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C5:.*]] = llvm.mlir.constant(5 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_5:.*]] = llvm.insertvalue %[[C5]], %[[MEMREF_BLDR_4]][4, 0] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C5_:.*]] = llvm.mlir.constant(5 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_6:.*]] = llvm.insertvalue %[[C5_]], %[[MEMREF_BLDR_5]][3, 1] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_7:.*]] = llvm.insertvalue %[[C1]], %[[MEMREF_BLDR_6]][4, 1] : [[DESCRIPTOR_TYPE_2D]] + +// ----- + +// CHECK-LABEL: func @dynamic_memref_cast +func @dynamic_memref_cast(%buf : memref) { + %size_X = constant 10 : index + %size_Y = constant 50 : index + %stride_X = constant 1 : index + %stride_Y = constant 0 : index + %0 = xla_lhlo.dynamic_memref_cast %buf(%size_X, %size_Y)[%stride_X, %stride_Y] + : memref -> memref + return +} +// CHECK: %[[C10:.*]] = llvm.mlir.constant(10 : index) : !llvm.i64 +// CHECK: %[[C50:.*]] = llvm.mlir.constant(50 : index) : !llvm.i64 +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + +// CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE:!.*]] + +// CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE]] + +// CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE]] + +// CHECK: %[[SRC_OFFSET:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][2] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_3:.*]] = llvm.insertvalue 
%[[SRC_OFFSET]], %[[MEMREF_BLDR_2]][2] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_4:.*]] = llvm.insertvalue %[[C10]], %[[MEMREF_BLDR_3]][3, 0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_5:.*]] = llvm.insertvalue %[[C1]], %[[MEMREF_BLDR_4]][4, 0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_6:.*]] = llvm.insertvalue %[[C50]], %[[MEMREF_BLDR_5]][3, 1] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_7:.*]] = llvm.insertvalue %[[C0]], %[[MEMREF_BLDR_6]][4, 1] : [[DESCRIPTOR_TYPE]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir index cb169e060ef..32c367f97d6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -22,13 +22,13 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK-DAG: [[C10:%.*]] = constant 10 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C100]], [[C5]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[C10]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref<100x10x5xf32> -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -37,12 +37,12 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -66,10 +66,10 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK-DAG: [[C1:%.*]] = constant 1 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[I:%.*]]) = ([[C0]]) +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[I:%.*]]) = ([[C0]]) // CHECK-SAME: to ([[C100]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]]{{\[}}[[I]]{{\]}} -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -78,9 +78,9 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] +// CHECK: scf.reduce.return [[ACC_RESULT]] // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[C0]]] // ----- 
@@ -107,13 +107,13 @@ func @dynamic_reduce(%arg: memref, // CHECK: [[DIM1:%.*]] = dim [[ARG_BUF]], 1 : memref // CHECK: [[DIM2:%.*]] = dim [[ARG_BUF]], 2 : memref // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[DIM0]], [[DIM2]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[DIM1]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -122,12 +122,12 @@ func @dynamic_reduce(%arg: memref, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -158,9 +158,9 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-DAG: [[C56:%.*]] = constant 56 : index // CHECK-DAG: [[C112:%.*]] = constant 112 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel // CHECK-SAME: ([[IW:%.*]], [[JW:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C3]], [[C3]]) step ([[C1]], [[C1]]) // CHECK-SAME: init ([[INIT]]) -> f32 { @@ -177,15 +177,15 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: [[INDEX_J_FITS:%.*]] = cmpi "ult", [[INDEX_J]], [[C112]] // CHECK: [[IN_BOUNDS_1:%.*]] = and [[IN_BOUNDS_0]], [[INDEX_J_FITS]] -// CHECK: [[ELEM_TO_REDUCE:%.*]] = loop.if [[IN_BOUNDS_1]] -> (f32) { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = scf.if [[IN_BOUNDS_1]] -> (f32) { // CHECK: [[OPERAND_ELEM:%.*]] = // CHECK-SAME: load [[OPERAND_BUF]]{{\[}}[[INDEX_I]], [[INDEX_J]]] -// CHECK: loop.yield [[OPERAND_ELEM]] : f32 +// CHECK: scf.yield [[OPERAND_ELEM]] : f32 // CHECK: } else { -// CHECK: loop.yield [[INIT]] : f32 +// CHECK: scf.yield [[INIT]] : f32 // CHECK: } -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -194,12 +194,12 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.maximum"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: 
loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: return // CHECK: } diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 23e9d9b68e0..1a44428d2e9 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -178,3 +178,73 @@ func @fusion_memref(%input1: memref<10xf32>, %input2: memref<10xf32>, %input3: m } ) : () -> () return } + +// ----- + +// CHECK-LABEL: func @case_memref +func @case_memref(%index: memref, %operand_1: memref, %operand_2: memref, %operand_3: memref, %out: memref) -> () { + "xla_lhlo.case"(%index, %operand_1, %operand_2, %operand_3, %out) ( { + ^bb0(%arg0: memref): + "xla_lhlo.negate"(%arg0, %out) : (memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + }, { + ^bb0(%arg0: memref): + "xla_lhlo.copy"(%arg0, %out) : (memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + }, { + ^bb0(%arg0: memref): + "xla_lhlo.add"(%arg0, %arg0, %out) : (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + } + ) : (memref, memref, memref, memref, memref) -> () + return +} + +// ----- + +func @static_memref_cast(%in: memref<10x1xf32>) { + %out = xla_lhlo.static_memref_cast %in + : memref<10x1xf32> -> memref<10xf32, offset: 0, strides: [1]> + return +} +// CHECK-LABEL: func @static_memref_cast + +// ----- + +func @static_memref_cast_dynamic_operand(%in: memref<10x?xf32>) { + // expected-error @+1 {{operand must have static shape}} + %out = xla_lhlo.static_memref_cast %in + : memref<10x?xf32> -> memref<10x1xf32, offset: 0, strides: [10, 1]> + return +} + +// ----- + +func @static_memref_cast_dynamic_result(%in: memref<10x1xf32>) { + // expected-error @+1 {{result must have static shape}} + %out = xla_lhlo.static_memref_cast %in + : memref<10x1xf32> -> memref<10x?xf32, offset: 0, strides: [?, ?]> + return +} + +// ----- + +func @dynamic_memref_cast(%in: memref) { + %size = constant 10 : index + %step = constant 1 : index + %out = xla_lhlo.dynamic_memref_cast %in(%size)[%step] + : memref -> memref + return +} +// CHECK-LABEL: func @dynamic_memref_cast + +// ----- + +func @dynamic_memref_cast_incompatible_result_type(%in: memref) { + // expected-error @+3 {{`sizes` args count must be equal to the rank of the output memref}} + %size = constant 10 : index + %step = constant 1 : index + %out = xla_lhlo.dynamic_memref_cast %in(%size)[%step] + : memref -> memref + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir b/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir index 35a5ae549d5..81376761467 100644 --- a/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lower-complex.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt %s -test-xla-lower-complex | FileCheck %s +// RUN: xla-opt %s -test-xla-chlo-legalize-to-hlo -test-xla-lower-complex | FileCheck %s --dump-input-on-failure // CHECK-LABEL: @add func @add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { @@ -15,21 +15,6 @@ func @add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @add_broadcast -func @add_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = 
"xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.add"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.add"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.add"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL0]], [[VAL1]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @add_unranked func @add_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -60,21 +45,6 @@ func @sub(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @sub_broadcast -func @sub_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.subtract"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.subtract"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.subtract"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL0]], [[VAL1]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @sub_unranked func @sub_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -109,25 +79,6 @@ func @mul(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % return %5, %6 : tensor<2xf32>, tensor<2xf32> } -// CHECK-LABEL: @mul_broadcast -func @mul_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.multiply"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.multiply"(%arg1, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL2:%.+]] = xla_hlo.subtract [[VAL0]], [[VAL1]] - // CHECK-DAG: [[VAL3:%.+]] = "xla_hlo.multiply"(%arg0, %arg3) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL4:%.+]] = "xla_hlo.multiply"(%arg1, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL5:%.+]] = xla_hlo.add [[VAL3]], [[VAL4]] - %4 = "xla_hlo.multiply"(%2, %3) 
{broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return %2, %5 : tensor<1x2xf32>, tensor<1x2xf32> - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - // CHECK-LABEL: @mul_unranked func @mul_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) @@ -186,45 +137,6 @@ func @div(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, % // ----- -// CHECK-LABEL: @div_broadcast -func @div_broadcast(%arg0 : tensor<1x2xf32>, %arg1 : tensor<1x2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>) { - %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> (tensor<1x2xcomplex>) - %3 = "xla_hlo.complex"(%arg2, %arg3) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xcomplex>) - - // CHECK-DAG: [[VAL0:%.+]] = "xla_hlo.negate"(%arg3) - - // Compute the numerator's real component: - // numerator.real = lhs.real * rhs.real lhs.imag * rhs.imag - // CHECK-DAG: [[VAL1:%.+]] = "xla_hlo.multiply"(%arg0, %arg2) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL2:%.+]] = "xla_hlo.multiply"(%arg1, [[VAL0]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL3:%.+]] = xla_hlo.subtract [[VAL1]], [[VAL2]] - - // Compute the real valued denominator as rhs * con(rhs): - // denominator = rhs.real * rhs.real + rhs.imag * rhs.imag - // CHECK-DAG: [[VAL4:%.+]] = xla_hlo.multiply %arg2, %arg2 - // CHECK-DAG: [[VAL5:%.+]] = xla_hlo.multiply %arg3, [[VAL0]] - // CHECK-DAG: [[VAL6:%.+]] = xla_hlo.subtract [[VAL4]], [[VAL5]] - - // Compute the numerator's imaginary component: - // numerator.imag = lhs.imag * rhs.real - lhs.real * rhs.imag - // CHECK-DAG: [[VAL7:%.+]] = "xla_hlo.multiply"(%arg1, %arg2) - // CHECK-DAG: [[VAL8:%.+]] = "xla_hlo.multiply"(%arg0, [[VAL0]]) - // CHECK-DAG: [[VAL9:%.+]] = xla_hlo.add [[VAL8]], [[VAL7]] - - // Divide the numerator by the real valued denominator. 
- // CHECK-DAG: [[VAL10:%.+]] = "xla_hlo.divide"([[VAL3]], [[VAL6]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[VAL11:%.+]] = "xla_hlo.divide"([[VAL9]], [[VAL6]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %4 = "xla_hlo.divide"(%2, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x2xcomplex>, tensor<2xcomplex>) -> (tensor<1x2xcomplex>) - - %5 = "xla_hlo.real"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - %6 = "xla_hlo.imag"(%4) : (tensor<1x2xcomplex>) -> (tensor<1x2xf32>) - - // CHECK: return [[VAL10]], [[VAL11]] - return %5, %6 : tensor<1x2xf32>, tensor<1x2xf32> -} - -// ----- - // CHECK-LABEL: @div_unranked func @div_unranked(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>, %arg3 : tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { %2 = "xla_hlo.complex"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xcomplex>) diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index 4050340ce49..55b55c7b4e2 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -1,214 +1,5 @@ // RUN: xla-opt -test-xla-materialize-broadcasts -split-input-file %s -o - | FileCheck --dump-input=fail %s -// CHECK-LABEL: @addBroadcastRhs -func @addBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastLhs -func @addBroadcastLhs(%arg0: tensor<4xf32>, %arg1: tensor<1x4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %arg1 : tensor<1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastMultidimension -func @addBroadcastMultidimension(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1x4xf32>) -> tensor<1x1x4xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %arg1 : tensor<1x1x4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>, tensor<1x1x4xf32>) -> tensor<1x1x4xf32> - return %0 : tensor<1x1x4xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastBothArgs -func @addBroadcastBothArgs(%arg0: tensor<1x2xf32>, %arg1: tensor<3x2x1xf32>) -> tensor<3x2x2xf32> { - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xf32>) -> tensor<3x2x2xf32> - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x1xf32>) -> 
tensor<3x2x2xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<3x2x2xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xf32>, tensor<3x2x1xf32>) -> tensor<3x2x2xf32> - return %0 : tensor<3x2x2xf32> -} - -// ----- - -// CHECK-LABEL: @addBroadcastScalar -func @addBroadcastScalar(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor) -> tensor<4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %[[BROADCAST1]] : tensor<4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// ----- - -// CHECK-LABEL: @addWithoutBroadcast -func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %arg1 : tensor<4xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -} - -// ----- - -// CHECK-LABEL: @addUnranked -func @addUnranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %arg0, %arg1 : tensor<*xf32> - %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - return %0 : tensor<*xf32> -} - -// ----- - -// CHECK-LABEL: @atan2BroadcastRhs -func @atan2BroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.atan2 %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.atan2"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @divBroadcastRhs -func @divBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.divide %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @maxBroadcastRhs -func @maxBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.maximum %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.maximum"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @minBroadcastRhs -func @minBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.minimum %arg0, %[[BROADCAST1]] : 
tensor<1x4xf32> - %0 = "xla_hlo.minimum"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @mulBroadcastRhs -func @mulBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.multiply %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @powBroadcastRhs -func @powBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.power %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.power"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @remainderBroadcastRhs -func @remainderBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.remainder %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.remainder"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftLeftBroadcastRhs -func @shiftLeftBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_left %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.shift_left"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftRightArithmeticBroadcastRhs -func @shiftRightArithmeticBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_right_arithmetic %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @shiftRightLogicalBroadcastRhs -func @shiftRightLogicalBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.shift_right_logical %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = 
"xla_hlo.shift_right_logical"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @subBroadcastRhs -func @subBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.subtract %arg0, %[[BROADCAST1]] : tensor<1x4xf32> - %0 = "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> - return %0 : tensor<1x4xf32> -} - -// ----- - -// CHECK-LABEL: @andBroadcastRhs -func @andBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.and %arg0, %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - -// CHECK-LABEL: @orBroadcastRhs -func @orBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.or %arg0, %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - -// CHECK-LABEL: @xorBroadcastRhs -func @xorBroadcastRhs(%arg0: tensor<1x4xi32>, %arg1: tensor<4xi32>) -> tensor<1x4xi32> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x4xi32> - // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.xor %arg0, %[[BROADCAST1]] : tensor<1x4xi32> - %0 = "xla_hlo.xor"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1x4xi32>, tensor<4xi32>) -> tensor<1x4xi32> - return %0 : tensor<1x4xi32> -} - -// ----- - // CHECK-LABEL: @clampBroadcast // CHECK-SAME: (%[[MIN:.+]]: tensor, %[[VAL:.+]]: tensor<4xf32>, %[[MAX:.+]]: tensor) func @clampBroadcast(%min: tensor, %value: tensor<4xf32>, %max: tensor) -> tensor<4xf32> { @@ -218,69 +9,3 @@ func @clampBroadcast(%min: tensor, %value: tensor<4xf32>, %max: tensor %0 = "xla_hlo.clamp"(%min, %value, %max) : (tensor, tensor<4xf32>, tensor) -> tensor<4xf32> return %0 : tensor<4xf32> } - -// ----- - -// CHECK-LABEL: @compareBroadcastRhs -func @compareBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xi1> { - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x4xf32> - // CHECK-NEXT: %[[RESULT:.*]] = "xla_hlo.compare"(%arg0, %[[BROADCAST1]]) {comparison_direction = "NE"} : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xi1> - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xi1> - return %0 : tensor<1x4xi1> -} - -// ----- - -// CHECK-LABEL: 
@dynamicCompareBroadcastRhs -func @dynamicCompareBroadcastRhs(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 - // CHECK-NEXT: %c1 = constant 1 : index - // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor - // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: "xla_hlo.compare"(%[[BROADCAST0]], %[[BROADCAST1]]) {comparison_direction = "NE"} : (tensor, tensor) -> tensor - %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor, tensor) -> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: @dynamicBroadcastAdd -func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 - // CHECK-NEXT: %c1 = constant 1 : index - // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor - // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: @dynamicBroadcastAddScalar -func @dynamicBroadcastAddScalar(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 - // CHECK-NEXT: %[[DIM1:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[DIM1]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xi32>) -> 
tensor - // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor - %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor - return %0 : tensor -} diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index 8cb63311657..0a69ee93aee 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -156,6 +156,98 @@ func @broadcast_in_dim_bad_shape_mismatch(%arg0: tensor<3xi32>) -> tensor<1x2x3x // ----- +func @case_mismatch_num_args(%index: tensor, %operand_1: tensor, %operand_2: tensor, %operand_3: tensor) -> tensor { + // expected-error@+1 {{expects branch regions to have single argument, but found 2 for branch 1}} + %0 = "xla_hlo.case"(%index, %operand_1, %operand_2, %operand_3) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor, %arg1: tensor): + %1 = "xla_hlo.copy"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + } + ) : (tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @case_mismatch_num_results(%index: tensor, %operand_1: tensor, %operand_2: tensor, %operand_3: tensor) -> tensor { + // expected-error@+1 {{branch 1 returned values do not match op result types}} + %0 = "xla_hlo.case"(%index, %operand_1, %operand_2, %operand_3) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.copy"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1, %arg0) : (tensor, tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + } + ) : (tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @case_mismatch_arg_type(%index: tensor, %operand_1: tensor, %operand_2: tensor, %operand_3: tensor) -> tensor { + // expected-error@+1 {{expects operand 2 to be of type 'tensor', but found 'tensor'}} + %0 = "xla_hlo.case"(%index, %operand_1, %operand_2, %operand_3) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = xla_hlo.constant dense<2.0> : tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + } + ) : (tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @case_mismatch_return_type(%index: tensor, %operand_1: tensor, %operand_2: tensor, %operand_3: tensor) -> tensor { + // expected-error@+1 {{branch 1 returned values do not match op result types}} + %0 = "xla_hlo.case"(%index, %operand_1, %operand_2, %operand_3) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = xla_hlo.constant dense<2> : tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + } + ) : (tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @case_empty_region(%index: tensor, %operand_1: tensor) -> () { + // 
expected-error@+1 {{cannot have empty regions}} + "xla_hlo.case"(%index, %operand_1) ( {} ) : (tensor, tensor) -> tensor + return +} + +// ----- + // CHECK-LABEL: func @comp_eq func @comp_eq(%arg0: tensor<3xi32>, %arg1: tensor<3xi32>) -> tensor<3xi1> { %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> @@ -453,14 +545,6 @@ func @rng_uniform_invalid_type(%mu: tensor>, %sigma: tensor) - // ----- -// CHECK-LABEL: @scalars_to_dimension_tensor -func @scalars_to_dimension_tensor(%arg0: i32, %arg1: i32) -> tensor<2xi32> { - %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (i32, i32) -> tensor<2xi32> - return %0 : tensor<2xi32> -} - -// ----- - // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir new file mode 100644 index 00000000000..9f54e40dcaa --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir @@ -0,0 +1,60 @@ +// RUN: xla-opt %s -xla-hlo-sink-constants-to-control-flow | FileCheck %s --dump-input=fail + +// Tests sinking constants to a while loop. + +// CHECK-LABEL: func @sink_const_to_while +func @sink_const_to_while(%arg0: tensor) -> tensor { + // CHECK-NEXT: xla_hlo.while + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1A:.+]]: tensor + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + // CHECK: "xla_hlo.compare"(%[[C0]], %[[ARG1A]]) + %1 = "xla_hlo.compare"(%c0, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1B:.+]]: tensor + // CHECK-DAG: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + // CHECK-DAG: %[[ADD0:.+]] = xla_hlo.add %[[ARG1B]], %[[ARG1B]] + %2 = xla_hlo.add %arg1, %arg1 : tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], %[[ADD0]] + %3 = xla_hlo.add %c1, %2 : tensor + // CHECK: %[[ADD2:.+]] = xla_hlo.add %[[C1]], %[[ADD1]] + %4 = xla_hlo.add %c1, %3 : tensor + "xla_hlo.return"(%4) : (tensor) -> () + }) : (tensor) -> tensor + return %0 : tensor +} + +// Tests sinking constants to a conditional op. 
+ +// CHECK-LABEL: func @sink_const_to_conditional +func @sink_const_to_conditional(%arg0: tensor) -> tensor { + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.compare"(%arg0, %c0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> + // CHECK: xla_hlo.if + %2 = "xla_hlo.if"(%0, %1, %1) ( { + ^bb0(%arg1: tuple>): + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + %3 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD0:.+]] = xla_hlo.add %[[C0]], + %4 = xla_hlo.add %c0, %3 : tensor + %5 = "xla_hlo.tuple"(%4) : (tensor) -> tuple> + "xla_hlo.return"(%5) : (tuple>) -> () + }, { + ^bb0(%arg1: tuple>): + // CHECK: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], + %7 = xla_hlo.add %c1, %6 : tensor + %8 = "xla_hlo.tuple"(%7) : (tensor) -> tuple> + "xla_hlo.return"(%8) : (tuple>) -> () + }) : (tensor, tuple>, tuple>) -> tuple> + %9 = "xla_hlo.get_tuple_element"(%2) {index = 0 : i32} : (tuple>) -> tensor + return %9 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/case.mlir b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir new file mode 100644 index 00000000000..dba9e8b61ca --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir @@ -0,0 +1,99 @@ +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s + +func @main() -> tensor { + %cst = constant {name = "constant"} dense<1> : tensor + %cst_0 = constant {name = "constant.1"} dense<5.600000e+01> : tensor + %cst_1 = constant {name = "constant.2"} dense<1.200000e+01> : tensor + %cst_2 = constant {name = "constant.3"} dense<1.300000e+01> : tensor + %0 = "xla_hlo.case"(%cst, %cst_0, %cst_1, %cst_2) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.copy"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }) {name = "conditional"} : (tensor, tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK: %[[NEGATE_BRANCH:.*]] ({{.*}}: f32[]) -> f32[] { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[] negate(f32[] %[[ARG]]) +// CHECK: } + +// CHECK: %[[COPY_BRANCH:.*]] ({{.*}}: f32[]) -> f32[] { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[] copy(f32[] %[[ARG]]) +// CHECK: } + +// CHECK: %[[FLOOR_BRANCH:.*]] ({{.*}}: f32[]) -> f32[] { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[] floor(f32[] %[[ARG]]) +// CHECK: } + +// CHECK-LABEL: ENTRY +// CHECK-SAME: () -> f32[] + +// CHECK: %[[INDEX:.*]] = s32[] constant(1) +// CHECK: %[[OPERAND_1:.*]] = f32[] constant(56) +// CHECK: %[[OPERAND_2:.*]] = f32[] constant(12) +// CHECK: %[[OPERAND_3:.*]] = f32[] constant(13) +// CHECK: ROOT %[[RESULT:.*]] = f32[] conditional(s32[] %[[INDEX]], f32[] %[[OPERAND_1]], f32[] %[[OPERAND_2]], f32[] %[[OPERAND_3]]), branch_computations={%[[NEGATE_BRANCH]], %[[COPY_BRANCH]], %[[FLOOR_BRANCH]]} + +// ----- + +func @main() -> (tensor, tensor) { + %cst = constant {name = "constant"} dense<1> : tensor + %cst_0 = constant {name = "constant.1"} dense<5.600000e+01> 
: tensor + %cst_1 = constant {name = "constant.2"} dense<1.200000e+01> : tensor + %cst_2 = constant {name = "constant.3"} dense<1.300000e+01> : tensor + %0:2 = "xla_hlo.case"(%cst, %cst_0, %cst_1, %cst_2) ( { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.negate"(%arg0) {name = "negate"} : (tensor) -> tensor + "xla_hlo.return"(%1, %1) : (tensor, tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.copy"(%arg0) {name = "copy"} : (tensor) -> tensor + "xla_hlo.return"(%1, %1) : (tensor, tensor) -> () + }, { + ^bb0(%arg0: tensor): + %1 = "xla_hlo.floor"(%arg0) {name = "floor"} : (tensor) -> tensor + "xla_hlo.return"(%1, %1) : (tensor, tensor) -> () + }) {name = "conditional"} : (tensor, tensor, tensor, tensor) -> (tensor, tensor) + return %0#0, %0#1 : tensor, tensor +} + +// CHECK: %[[NEGATE_BRANCH:.*]] ({{.*}}: f32[]) -> (f32[], f32[]) { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: %[[NEGATE:.*]] = f32[] negate(f32[] %[[ARG]]) +// CHECK: ROOT %[[TUPLE:.*]] = (f32[], f32[]) tuple(f32[] %[[NEGATE]], f32[] %[[NEGATE]]) +// CHECK: } + +// CHECK: %[[COPY_BRANCH:.*]] ({{.*}}: f32[]) -> (f32[], f32[]) { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: %[[COPY:.*]] = f32[] copy(f32[] %[[ARG]]) +// CHECK: ROOT %[[TUPLE:.*]] = (f32[], f32[]) tuple(f32[] %[[COPY]], f32[] %[[COPY]]) +// CHECK: } + +// CHECK: %[[FLOOR_BRANCH:.*]] ({{.*}}: f32[]) -> (f32[], f32[]) { +// CHECK: %[[ARG:.*]] = f32[] parameter(0) +// CHECK: %[[FLOOR:.*]] = f32[] floor(f32[] %[[ARG]]) +// CHECK: ROOT %[[TUPLE:.*]] = (f32[], f32[]) tuple(f32[] %[[FLOOR]], f32[] %[[FLOOR]]) +// CHECK: } + +// CHECK-LABEL: ENTRY +// CHECK-SAME: () -> (f32[], f32[]) + +// CHECK: %[[INDEX:.*]] = s32[] constant(1) +// CHECK: %[[OPERAND_1:.*]] = f32[] constant(56) +// CHECK: %[[OPERAND_2:.*]] = f32[] constant(12) +// CHECK: %[[OPERAND_3:.*]] = f32[] constant(13) +// CHECK: %[[TUPLE:.*]] = (f32[], f32[]) conditional(s32[] %[[INDEX]], f32[] %[[OPERAND_1]], f32[] %[[OPERAND_2]], f32[] %[[OPERAND_3]]), branch_computations={%[[NEGATE_BRANCH]], %[[COPY_BRANCH]], %[[FLOOR_BRANCH]]} +// CHECK: %[[RES_1:.*]] = f32[] get-tuple-element((f32[], f32[]) %[[TUPLE]]), index=0 +// CHECK: %[[RES_2:.*]] = f32[] get-tuple-element((f32[], f32[]) %[[TUPLE]]), index=1 +// CHECK: ROOT %[[RESULT:.*]] = (f32[], f32[]) tuple(f32[] %[[RES_1]], f32[] %[[RES_2]]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt new file mode 100644 index 00000000000..2ff223cd480 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt @@ -0,0 +1,46 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule Indexed_Conditional + +%Negate (x: f32[]) -> f32[] { + %x = f32[] parameter(0) + ROOT %negate = f32[] negate(f32[] %x) +} + +%Identity (y: f32[]) -> f32[] { + %y = f32[] parameter(0) + ROOT %copy = f32[] copy(f32[] %y) +} + +%Floor (z: f32[]) -> f32[] { + %z = f32[] parameter(0) + ROOT %floor = f32[] floor(f32[] %z) +} + +ENTRY %indexed_conditional () -> f32[] { + %constant = s32[] constant(1) + %constant.1 = f32[] constant(56) + %constant.2 = f32[] constant(12) + %constant.3 = f32[] constant(13) + ROOT %conditional = f32[] conditional(s32[] %constant, f32[] %constant.1, f32[] %constant.2, f32[] %constant.3), branch_computations={%Negate, %Identity, %Floor} +} + +// CHECK-LABEL: func @main() -> tensor +// CHECK: %[[INDEX:.*]] = constant {name = "constant"} dense<1> : tensor +// CHECK: %[[OPERAND_1:.*]] = constant {name 
= "{{.*}}"} dense<5.600000e+01> : tensor +// CHECK: %[[OPERAND_2:.*]] = constant {name = "{{.*}}"} dense<1.200000e+01> : tensor +// CHECK: %[[OPERAND_3:.*]] = constant {name = "{{.*}}"} dense<1.300000e+01> : tensor +// CHECK: %[[RESULT:.*]] = "xla_hlo.case"(%[[INDEX]], %[[OPERAND_1]], %[[OPERAND_2]], %[[OPERAND_3]]) ( { +// CHECK: ^bb0(%[[ARG_1:.*]]: tensor): +// CHECK: %[[RES_1:.*]] = "xla_hlo.negate"(%[[ARG_1]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: "xla_hlo.return"(%[[RES_1]]) : (tensor) -> () +// CHECK: }, { +// CHECK: ^bb0(%[[ARG_2:.*]]: tensor): +// CHECK: %[[RES_2:.*]] = "xla_hlo.copy"(%[[ARG_2]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: "xla_hlo.return"(%[[RES_2]]) : (tensor) -> () +// CHECK: }, { +// CHECK: ^bb0(%[[ARG_3:.*]]: tensor): +// CHECK: %[[RES_3:.*]] = "xla_hlo.floor"(%[[ARG_3]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: "xla_hlo.return"(%[[RES_3]]) : (tensor) -> () +// CHECK: }) {name = "{{.*}}"} : (tensor, tensor, tensor, tensor) -> tensor +// CHECK: return %[[RESULT]] : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 3650307ea94..20b43e8633d 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s --dump-input-on-failure // CHECK: HloModule func @main(%arg0: !xla_hlo.token, %arg1: !xla_hlo.token) -> !xla_hlo.token { @@ -96,34 +96,6 @@ func @main(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor // ----- -// CHECK: HloModule -func @main(%arg0: tensor<1x4xi32>, %arg1: tensor<2x4xi32>, %arg2: tensor<2x3x4xi32>) -> tensor<2x3x4xi32> { - // Same rank degenerate broadcast - // CHECK: [[ARG_0:%.*]] = s32[1,4] parameter(0) - // CHECK-NEXT: [[RESHAPE_1:%.*]] = s32[4] reshape(s32[1,4] [[ARG_0]]) - // CHECK-NEXT: [[BROADCAST_1:%.*]] = s32[2,4] broadcast(s32[4] [[RESHAPE_1]]) - // CHECK-NEXT: [[ARG_1:%.*]] = s32[2,4] parameter(1) - // CHECK-NEXT: s32[2,4] add(s32[2,4] [[BROADCAST_1]], s32[2,4] [[ARG_1]]) - %0 = "xla_hlo.add"(%arg0, %arg1) : (tensor<1x4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - - // Broadcast up rank - // CHECK-NEXT: [[BROADCAST_2:%.*]] = s32[2,3,4] broadcast(s32[2,4] [[ARG_1]]), dimensions={0,2} - // CHECK-NEXT: [[ARG_2:%.*]] = s32[2,3,4] parameter(2) - // CHECK-NEXT: s32[2,3,4] add(s32[2,3,4] [[BROADCAST_2]], s32[2,3,4] [[ARG_2]]) - %1 = "xla_hlo.add"(%arg1, %arg2) {broadcast_dimensions = dense<[0,2]> : tensor<2xi64>} : (tensor<2x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> - - // Broadcast up rank + degenerate broadcast - // CHECK-NEXT: [[BROADCAST_3:%.*]] = s32[2,1,4] broadcast(s32[1,4] [[ARG_0]]), dimensions={1,2} - // CHECK-NEXT: [[RESHAPE_2:%.*]] = s32[2,4] reshape(s32[2,1,4] [[BROADCAST_3]]) - // CHECK-NEXT: [[BROADCAST_4:%.*]] = s32[2,3,4] broadcast(s32[2,4] [[RESHAPE_2]]), dimensions={0,2} - // CHECK: ROOT - // CHECK-SAME: s32[2,3,4] add(s32[2,3,4] [[BROADCAST_4]], s32[2,3,4] [[ARG_2]]) - %2 = "xla_hlo.add"(%arg0, %arg2) {broadcast_dimensions = dense<[1,2]> : tensor<2xi64>} : (tensor<1x4xi32>, tensor<2x3x4xi32>) -> tensor<2x3x4xi32> - return %2 : tensor<2x3x4xi32> -} - -// ----- - // CHECK: HloModule func @main(%arg0: tensor<2xi32>) -> tensor<2xf32> { %0 = "xla_hlo.bitcast_convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> @@ -294,6 +266,12 @@ func 
@main() { // CHECK: f16[4] constant({1, -4, -65504, 0.015625} %cst_8 = constant dense<[1.0e+00, -4.0e+00, -65504.0e+00, 1.5625e-02]> : tensor<4xf16> + // CHECK: c64[] constant((1, 0)) + %cst_9 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + + // CHECK: c128[] constant((1, 0)) + %cst_10 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + return } @@ -1038,3 +1016,16 @@ func @main(%arg0: tensor<4xui8>) -> (tensor<4xui8>) { // CHECK: ENTRY // CHECK: %[[ARG0:.*]] = u8[4] parameter(0) // ROOT %[[RESULT:.*]] = u8[4] not(u8[4] %[[ARG0]]) + +// ----- + +// CHECK: HloModule +func @main(%arg0: tensor<4xi32>) -> (tensor<*xi32>) { + %0 = "xla_hlo.not"(%arg0) : (tensor<4xi32>) -> tensor<4xi32> + %1 = tensor_cast %0 : tensor<4xi32> to tensor<*xi32> + return %1 : tensor<*xi32> +} + +// CHECK: ENTRY +// CHECK: %[[ARG0:.*]] = s32[4] parameter(0) +// ROOT %[[RESULT:.*]] = s32[4] not(s32[4] %[[ARG0]]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir b/tensorflow/compiler/mlir/xla/tests/translate/if.mlir similarity index 98% rename from tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir rename to tensorflow/compiler/mlir/xla/tests/translate/if.mlir index e510a2aa35f..6542966fc7c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/conditional.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/if.mlir @@ -41,7 +41,7 @@ func @main(%arg0: tensor) -> tuple> { %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> // CHECK: %[[VAL3:.+]] = (f32[]) conditional(pred[] %[[VAL1]], (f32[]) %[[VAL2]], (f32[]) %[[VAL2]]), true_computation=[[R0]], false_computation=[[R1]] - %2 = "xla_hlo.conditional"(%0, %1, %1) ( { + %2 = "xla_hlo.if"(%0, %1, %1) ( { ^bb0(%arg1: tuple>): %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor %7 = "xla_hlo.log"(%6) : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt similarity index 97% rename from tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt rename to tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt index 00f6ec2d308..d2c6e669e9b 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/conditional.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/if_conditional.hlotxt @@ -29,7 +29,7 @@ ENTRY %tfcompile.20 { // CHECK: [[R2:%.+]] = "xla_hlo.tuple"([[A0]]) %tuple.5 = (f32[]) tuple(%arg0.1), metadata={op_type="If" op_name="cond/Merge_if"} - // CHECK: [[R3:%.+]] = "xla_hlo.conditional"([[R1]], [[R2]], [[R2]]) ( { + // CHECK: [[R3:%.+]] = "xla_hlo.if"([[R1]], [[R2]], [[R2]]) ( { // CHECK: ^bb0([[A1:%.+]]: tuple>): // CHECK: [[R7:%.+]] = "xla_hlo.get_tuple_element"([[A1]]) // CHECK: [[R8:%.+]] = "xla_hlo.log"([[R7]]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 75471e3a090..af45f84b34d 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s --dump-input-on-failure HloModule main @@ -20,29 +20,6 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { ROOT %dot.4 = f32[] dot(f32[4]{0} %add.3, f32[4]{0} %Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={0} } -// This test is more thorough than those of the the other binary ops to 
test -// their shared functionality. - -// CHECK-LABEL: func @test_add -%test_add (Arg_0.1: f32[4], Arg_1.2: f32[4], Arg_2.3: f32[], Arg_3.4: f32[]) -> f32[4] { - %Arg_0.1 = f32[4] parameter(0) - %Arg_1.2 = f32[4] parameter(1) - %Arg_2.3 = f32[] parameter(2) - %Arg_3.4 = f32[] parameter(3) - - // Add two tensors - // CHECK-NEXT: xla_hlo.add %arg0, %arg1 {name = "{{.*}}"} - %add.3 = f32[4] add(f32[4] %Arg_0.1, f32[4] %Arg_1.2) - - // Add two scalars - // CHECK-NEXT: xla_hlo.add %arg2, %arg3 - %add.4 = f32[] add(f32[] %Arg_2.3, f32[] %Arg_3.4) - - // Add a tensor and scalar - // CHECK-NEXT: "xla_hlo.add"(%0, %1) - ROOT %add.5 = f32[4] add(f32[4] %add.3, f32[] %add.4) -} - // CHECK-LABEL: func @test_after_all // CHECK-SAME: ([[VAL_0:%.*]]: !xla_hlo.token, [[VAL_1:%.*]]: !xla_hlo.token) -> !xla_hlo.token %test_after_all (token0: token[], token1: token[] ) -> token[] { @@ -159,11 +136,11 @@ add { } -// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<1xf32>) -> tensor<3xi1> { -%test_compare (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[1]) -> pred[3] { +// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<3xf32>) -> tensor<3xi1> { +%test_compare (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[3]) -> pred[3] { %Arg_0.1 = f32[3] parameter(0) %Arg_1.2 = f32[3] parameter(1) - %Arg_2.3 = f32[1] parameter(2) + %Arg_2.3 = f32[3] parameter(2) // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.4 = pred[3] compare(Arg_0.1, Arg_1.2), direction=EQ @@ -172,7 +149,7 @@ add { %compare.5 = pred[3] compare(Arg_0.1, Arg_1.2), direction=LE // Requires broadcast of compatible tensors. - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "{{.*}}"} : (tensor<3xf32>, tensor<1xf32>) -> tensor<3xi1> + // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> ROOT %compare.6 = pred[3] compare(Arg_0.1, Arg_2.3), direction=GT } @@ -212,10 +189,14 @@ add { // CHECK: dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xbf16> %constant.3 = bf16[4] constant({1, 2, 3, 4}) + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.4 = c64[] constant((1, 0)) + + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.5 = c128[] constant((1, 0)) + // CHECK: dense<[1.000000e+00, -4.000000e+00, -6.550400e+04, 1.562500e-02]> : tensor<4xf16> - ROOT %constant.4 = f16[4] constant({1, -4, -65504, 0.015625}) - - + ROOT %constant.6 = f16[4] constant({1, -4, -65504, 0.015625}) } // TODO(b/129422361) Potentially update when copy, reshape, and conv have actual @@ -276,19 +257,19 @@ add { ROOT %convolution = f32[1,5,1] convolution(f32[1,2,1] %input, f32[1,1,1] %filter), feature_group_count=1, dim_labels=b0f_0io->b0f, window={pad=1_2 size=1} } -// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf64> { -%test_convert (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f64[4] { +// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf64> { +%test_convert (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f64[4] { %Arg_0.1 = f32[4] parameter(0) - %Arg_1.2 = f32[] parameter(1) + %Arg_1.2 = f32[4] parameter(1) // CHECK-NEXT: %0 = "xla_hlo.convert"(%arg0) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> %convert.3 = f64[4] convert(f32[4] %Arg_0.1) - // CHECK-NEXT: %1 
= "xla_hlo.convert"(%arg1) {name = "{{.*}}"} : (tensor) -> tensor - %convert.4 = f64[] convert(f32[] %Arg_1.2) + // CHECK-NEXT: %1 = "xla_hlo.convert"(%arg1) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> + %convert.4 = f64[4] convert(f32[4] %Arg_1.2) - // CHECK-NEXT: "xla_hlo.add"(%0, %1) - ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[] %convert.4) + // CHECK-NEXT: xla_hlo.add %0, %1 + ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[4] %convert.4) } // CHECK-LABEL: func @test_cosine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 9778772e250..3d8646e7fb9 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -106,24 +106,19 @@ func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor - // CHECK-DAG: %[[INDEX_CAST:.+]] = index_cast %[[DIM]] : index to i32 - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INDEX_CAST]]) : (i32) -> tensor<1xi32> - // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements(%[[DIM]]) : tensor<1xindex> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor // CHECK-DAG: %[[INPUT_DIM_0:.+]] = dim %[[X]], 0 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_0:.+]] = index_cast %[[INPUT_DIM_0]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_1:.+]] = index_cast %[[INPUT_DIM_1]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_2:.+]] = index_cast %[[INPUT_DIM_2]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_3:.+]] = index_cast %[[INPUT_DIM_3]] : index to i32 - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_INDEX_CAST_0]], %[[INPUT_INDEX_CAST_1]], %[[INPUT_INDEX_CAST_2]], %[[INPUT_INDEX_CAST_3]]) : (i32, i32, i32, i32) -> tensor<4xi32> - // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = 
tensor_from_elements(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : tensor<4xindex> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[X_CENTER:.+]] = xla_hlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor // CHECK-DAG: %[[X_SCALED:.+]] = xla_hlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor // CHECK-DAG: %[[X_NORMED:.+]] = xla_hlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir new file mode 100644 index 00000000000..8b26a5e4121 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir @@ -0,0 +1,65 @@ +// RUN: xla-opt -transform-unranked-hlo -split-input-file %s | FileCheck --dump-input=fail %s + +// Check the validity of expected IR. +// CHECK-LABEL: @sqr_transform_result +func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { + + // Flatten operand shape. + %shape = shape.shape_of %a : tensor<*xf32> + %num_elements = shape.num_elements %shape + %num_elements_as_index = shape.size_to_index %num_elements + %flat_shape = tensor_from_elements(%num_elements_as_index) : tensor<1xindex> + %flat_a = "xla_hlo.dynamic_reshape"(%a, %flat_shape) + : (tensor<*xf32>, tensor<1xindex>) -> tensor + + // Apply operation. + %flat_b = "xla_hlo.sqrt"(%flat_a) : (tensor) -> tensor + + // Restore original shape. + %shape_as_extent_tensor = "shape.to_extent_tensor"(%shape) + : (!shape.shape) -> tensor + %b = "xla_hlo.dynamic_reshape"(%flat_b, %shape_as_extent_tensor) + : (tensor, tensor) -> tensor<*xf32> + + return %b : tensor<*xf32> +} + +// ----- +// Check transformation of unranked code. 
+// CHECK-LABEL: @sqrt +// CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) +func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> + // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] + // CHECK-NEXT: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> + // CHECK-NEXT: %[[FLAT_A:.*]] = "xla_hlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor + // CHECK-NEXT: %[[FLAT_B:.*]] = "xla_hlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor + // CHECK-NEXT: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = "shape.to_extent_tensor"(%[[SHAPE]]) : (!shape.shape) -> tensor + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK-NEXT: return %[[B]] : tensor<*xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<*xf32>) -> tensor<*xf32> + return %b : tensor<*xf32> +} + +// ----- +// Not transformed when ranked. +// CHECK-LABEL: @sqrt_ranked +// CHECK-SAME: (%[[A:.*]]: tensor<3x?xf32>) +func @sqrt_ranked(%a: tensor<3x?xf32>) -> tensor<3x?xf32> { + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.sqrt"(%[[A]]) : (tensor<3x?xf32>) -> tensor<3x?xf32> + // CHECK-NEXT: return %[[B]] : tensor<3x?xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<3x?xf32>) -> tensor<3x?xf32> + return %b : tensor<3x?xf32> +} + +// ----- +// Not transformed when statically shaped. +// CHECK-LABEL: @sqrt_static +// CHECK-SAME: (%[[A:.*]]: tensor<2x3xf32>) +func @sqrt_static(%a: tensor<2x3xf32>) -> tensor<2x3xf32> { + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.sqrt"(%[[A]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> + // CHECK-NEXT: return %[[B]] : tensor<2x3xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %b : tensor<2x3xf32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc deleted file mode 100644 index 640b9b84622..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc +++ /dev/null @@ -1,485 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements logic for computing proper alloc and dealloc positions. -// The main class is the BufferAssignment class that realizes this analysis. -// In order to put allocations and deallocations at safe positions, it is -// significantly important to put them into the proper blocks. However, the -// liveness analysis does not pay attention to aliases, which can occur due to -// branches (and their associated block arguments) in general. For this purpose, -// BufferAssignment firstly finds all possible aliases for a single value (using -// the BufferAssignmentAliasAnalysis class). 
Consider the following example: -// -// ^bb0(%arg0): -// cond_br %cond, ^bb1, ^bb2 -// ^bb1: -// br ^exit(%arg0) -// ^bb2: -// %new_value = ... -// br ^exit(%new_value) -// ^exit(%arg1): -// return %arg1; -// -// Using liveness information on its own would cause us to place the allocs and -// deallocs in the wrong block. This is due to the fact that %new_value will not -// be liveOut of its block. Instead, we have to place the alloc for %new_value -// in bb0 and its associated dealloc in exit. Using the class -// BufferAssignmentAliasAnalysis, we will find out that %new_value has a -// potential alias %arg1. In order to find the dealloc position we have to find -// all potential aliases, iterate over their uses and find the common -// post-dominator block. In this block we can safely be sure that %new_value -// will die and can use liveness information to determine the exact operation -// after which we have to insert the dealloc. Finding the alloc position is -// highly similar and non- obvious. Again, we have to consider all potential -// aliases and find the common dominator block to place the alloc. -// -// TODO(dfki): -// The current implementation does not support loops. The only thing that -// is currently missing is a high-level loop analysis that allows us to move -// allocs and deallocs outside of the loop blocks. - -#include "tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project -#include "mlir/IR/Function.h" // TF:llvm-project -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Pass/Pass.h" // TF:llvm-project -#include "absl/memory/memory.h" - -namespace mlir { -namespace xla { -namespace { - -//===----------------------------------------------------------------------===// -// BufferAssignmentAliasAnalysis -//===----------------------------------------------------------------------===// - -/// A straight-forward alias analysis which ensures that all aliases of all -/// values will be determined. This is a requirement for the BufferAssignment -/// class since you need to determine safe positions to place alloc and -/// deallocs. -class BufferAssignmentAliasAnalysis { - public: - using ValueSetT = SmallPtrSet; - - public: - /// Constructs a new alias analysis using the op provided. - BufferAssignmentAliasAnalysis(Operation* op) { build(op->getRegions()); } - - /// Finds all immediate and indirect aliases this value could potentially - /// have. Note that the resulting set will also contain the value provided as - /// it is an alias of itself. - ValueSetT resolve(Value value) const { - ValueSetT result; - resolveRecursive(value, result); - return result; - } - - private: - /// Recursively determines alias information for the given value. It stores - /// all newly found potential aliases in the given result set. - void resolveRecursive(Value value, ValueSetT& result) const { - if (!result.insert(value).second) { - return; - } - auto it = aliases.find(value); - if (it == aliases.end()) return; - for (auto alias : it->second) { - resolveRecursive(alias, result); - } - } - - /// This function constructs a mapping from values to its immediate aliases. - /// It iterates over all blocks, gets their predecessors, determines the - /// values that will be passed to the corresponding block arguments and - /// inserts them into map. 
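As a rough illustration of the aliasing rule described above, here is a minimal standalone C++ sketch. It is not the MLIR API and the value names are hypothetical strings; it only models how the ^bb0/^bb1/^bb2/^exit example yields an immediate-alias map (branch operands alias the block arguments they feed) and how transitive aliases of %new_value are resolved from it.

#include <iostream>
#include <map>
#include <set>
#include <string>

using ValueSet = std::set<std::string>;

// Maps a value to its immediate aliases (the block arguments it is passed to).
std::map<std::string, ValueSet> aliases;

// Recursively collects all transitive aliases of `value`, including itself.
void resolveRecursive(const std::string& value, ValueSet& result) {
  if (!result.insert(value).second) return;  // already visited
  auto it = aliases.find(value);
  if (it == aliases.end()) return;
  for (const auto& alias : it->second) resolveRecursive(alias, result);
}

int main() {
  // Walk the branches of the example:
  //   ^bb1: br ^exit(%arg0)       -> %arg0 aliases ^exit's %arg1
  //   ^bb2: br ^exit(%new_value)  -> %new_value aliases ^exit's %arg1
  aliases["%arg0"].insert("%arg1");
  aliases["%new_value"].insert("%arg1");

  // Resolving %new_value yields {%new_value, %arg1}, which is why its dealloc
  // must come after the last use of %arg1 in ^exit.
  ValueSet result;
  resolveRecursive("%new_value", result);
  for (const auto& v : result) std::cout << v << "\n";
  return 0;
}

In the real pass the same resolution runs over mlir::Value instances gathered from the branch op interface; the toy map above only fixes the idea that aliasing flows from predecessor operands into block arguments.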
- void build(MutableArrayRef regions) { - for (Region& region : regions) { - for (Block& block : region) { - // Iterate over all predecessor and get the mapped values to their - // corresponding block arguments values. - for (auto pred : block.getPredecessors()) { - // Determine the current successor index of the current predecessor. - unsigned successorIndex = std::distance( - pred->getSuccessors().begin(), - llvm::find_if(pred->getSuccessors(), [&](Block* successor) { - return successor == █ - })); - // Get the terminator and the values that will be passed to our block. - if (auto branchInterface = - dyn_cast(pred->getTerminator())) { - // Query the branch op interace to get the successor operands. - auto successorOps = - branchInterface.getSuccessorOperands(successorIndex); - if (successorOps.hasValue()) { - // Build the actual mapping of values to their immediate aliases. - for (auto arg : block.getArguments()) { - Value predecessorArgValue = - successorOps.getValue()[arg.getArgNumber()]; - aliases[predecessorArgValue].insert(arg); - } - } - } - } - } - } - } - - /// Maps values to all immediate aliases this value can have. - llvm::DenseMap aliases; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentPositions -//===----------------------------------------------------------------------===// - -/// Stores proper alloc and dealloc positions to place dialect-specific alloc -/// and dealloc operations. -struct BufferAssignmentPositions { - public: - BufferAssignmentPositions() - : allocPosition(nullptr), deallocPosition(nullptr) {} - - /// Creates a new positions tuple including alloc and dealloc positions. - BufferAssignmentPositions(Operation* allocPosition, - Operation* deallocPosition) - : allocPosition(allocPosition), deallocPosition(deallocPosition) {} - - /// Returns the alloc position before which the alloc operation has to be - /// inserted. - Operation* getAllocPosition() const { return allocPosition; } - - /// Returns the dealloc position after which the dealloc operation has to be - /// inserted. - Operation* getDeallocPosition() const { return deallocPosition; } - - private: - Operation* allocPosition; - Operation* deallocPosition; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentAnalysis -//===----------------------------------------------------------------------===// - -// The main buffer assignment analysis used to place allocs and deallocs. -class BufferAssignmentAnalysis { - public: - using DeallocSetT = SmallPtrSet; - - public: - BufferAssignmentAnalysis(Operation* op) - : operation(op), - liveness(op), - dominators(op), - postDominators(op), - aliases(op) {} - - /// Computes the actual positions to place allocs and deallocs for the given - /// value. - BufferAssignmentPositions computeAllocAndDeallocPositions(Value value) const { - if (value.use_empty()) { - return BufferAssignmentPositions(value.getDefiningOp(), - value.getDefiningOp()); - } - // Get all possible aliases - auto possibleValues = aliases.resolve(value); - return BufferAssignmentPositions(getAllocPosition(value, possibleValues), - getDeallocPosition(value, possibleValues)); - } - - /// Finds all associated dealloc nodes for the alloc nodes using alias - /// information. 
- DeallocSetT findAssociatedDeallocs(AllocOp alloc) const { - DeallocSetT result; - auto possibleValues = aliases.resolve(alloc); - for (auto alias : possibleValues) { - for (auto user : alias.getUsers()) { - if (isa(user)) result.insert(user); - } - } - return result; - } - - /// Dumps the buffer assignment information to the given stream. - void print(raw_ostream& os) const { - os << "// ---- Buffer Assignment -----\n"; - - for (Region& region : operation->getRegions()) - for (Block& block : region) - for (Operation& operation : block) - for (Value result : operation.getResults()) { - BufferAssignmentPositions positions = - computeAllocAndDeallocPositions(result); - os << "Positions for "; - result.print(os); - os << "\n Alloc: "; - positions.getAllocPosition()->print(os); - os << "\n Dealloc: "; - positions.getDeallocPosition()->print(os); - os << "\n"; - } - } - - private: - /// Finds a proper placement block to store alloc/dealloc node according to - /// the algorithm described at the top of the file. It supports dominator and - /// post-dominator analyses via template arguments. - template - Block* findPlacementBlock(Value value, const AliasesT& aliases, - const DominatorT& doms) const { - assert(!value.isa() && "Cannot place a block argument"); - // Start with the current block the value is defined in. - Block* dom = value.getDefiningOp()->getBlock(); - // Iterate over all aliases and their uses to find a safe placement block - // according to the given dominator information. - for (auto alias : aliases) { - for (auto user : alias.getUsers()) { - // Move upwards in the dominator tree to find an appropriate - // dominator block that takes the current use into account. - dom = doms.findNearestCommonDominator(dom, user->getBlock()); - } - } - return dom; - } - - /// Finds a proper alloc positions according to the algorithm described at the - /// top of the file. - template - Operation* getAllocPosition(Value value, const AliasesT& aliases) const { - // Determine the actual block to place the alloc and get liveness - // information. - auto placementBlock = findPlacementBlock(value, aliases, dominators); - auto livenessInfo = liveness.getLiveness(placementBlock); - - // We have to ensure that the alloc will be before the first use of all - // aliases of the given value. We first assume that there are no uses in the - // placementBlock and that we can safely place the alloc before the - // terminator at the end of the block. - Operation* startOperation = placementBlock->getTerminator(); - // Iterate over all aliases and ensure that the startOperation will point to - // the first operation of all potential aliases in the placementBlock. - for (auto alias : aliases) { - auto aliasStartOperation = livenessInfo->getStartOperation(alias); - // Check whether the aliasStartOperation lies in the desired block and - // whether it is before the current startOperation. If yes, this will be - // the new startOperation. - if (aliasStartOperation->getBlock() == placementBlock && - aliasStartOperation->isBeforeInBlock(startOperation)) { - startOperation = aliasStartOperation; - } - } - // startOperation is the first operation before which we can safely store - // the alloc taking all potential aliases into account. - return startOperation; - } - - /// Finds a proper dealloc positions according to the algorithm described at - /// the top of the file. 
- template - Operation* getDeallocPosition(Value value, const AliasesT& aliases) const { - // Determine the actual block to place the dealloc and get liveness - // information. - auto placementBlock = findPlacementBlock(value, aliases, postDominators); - auto livenessInfo = liveness.getLiveness(placementBlock); - - // We have to ensure that the dealloc will be after the last use of all - // aliases of the given value. We first assume that there are no uses in the - // placementBlock and that we can safely place the dealloc at the beginning. - Operation* endOperation = &placementBlock->front(); - // Iterate over all aliases and ensure that the endOperation will point to - // the last operation of all potential aliases in the placementBlock. - for (auto alias : aliases) { - auto aliasEndOperation = - livenessInfo->getEndOperation(alias, endOperation); - // Check whether the aliasEndOperation lies in the desired block and - // whether it is behind the current endOperation. If yes, this will be the - // new endOperation. - if (aliasEndOperation->getBlock() == placementBlock && - endOperation->isBeforeInBlock(aliasEndOperation)) { - endOperation = aliasEndOperation; - } - } - // endOperation is the last operation behind which we can safely store the - // dealloc taking all potential aliases into account. - return endOperation; - } - - /// The operation this transformation was constructed from. - Operation* operation; - - /// The underlying liveness analysis to compute fine grained information about - /// alloc and dealloc positions. - Liveness liveness; - - /// The dominator analysis to place allocs in the appropriate blocks. - DominanceInfo dominators; - - /// The post dominator analysis to place deallocs in the appropriate blocks. - PostDominanceInfo postDominators; - - /// The internal alias analysis to ensure that allocs and deallocs take all - /// their potential aliases into account. - BufferAssignmentAliasAnalysis aliases; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentPass -//===----------------------------------------------------------------------===// - -/// The actual buffer assignment pass that moves alloc and dealloc nodes into -/// the right positions. It uses the algorithm described at the top of the file. -// TODO(dfki): create a templated version that allows to match dialect-specific -// alloc/dealloc nodes and to insert dialect-specific dealloc node. -struct BufferAssignmentPass - : mlir::PassWrapper { - void runOnFunction() override { - // Get required analysis information first. - auto& analysis = getAnalysis(); - - // Compute an initial placement of all nodes. - llvm::SmallDenseMap placements; - getFunction().walk([&](AllocOp alloc) { - placements[alloc] = analysis.computeAllocAndDeallocPositions(alloc); - }); - - // Move alloc (and dealloc - if any) nodes into the right places - // and insert dealloc nodes if necessary. - getFunction().walk([&](AllocOp alloc) { - // Find already associated dealloc nodes. - auto deallocs = analysis.findAssociatedDeallocs(alloc); - assert(deallocs.size() < 2 && - "Not supported number of associated dealloc operations"); - - // Move alloc node to the right place. - BufferAssignmentPositions& positions = placements[alloc]; - Operation* allocOperation = alloc.getOperation(); - allocOperation->moveBefore(positions.getAllocPosition()); - - // If there is an existing dealloc, move it to the right place. 
- if (deallocs.size()) { - Operation* nextOp = positions.getDeallocPosition()->getNextNode(); - assert(nextOp && "Invalid Dealloc operation position"); - (*deallocs.begin())->moveBefore(nextOp); - } else { - // If there is no dealloc node, insert one in the right place. - OpBuilder builder(alloc); - builder.setInsertionPointAfter(positions.getDeallocPosition()); - builder.create(allocOperation->getLoc(), alloc); - } - }); - }; -}; - -} // namespace - -//===----------------------------------------------------------------------===// -// BufferAssignmentPlacer -//===----------------------------------------------------------------------===// - -/// Creates a new assignment placer. -BufferAssignmentPlacer::BufferAssignmentPlacer(Operation* op) - : operation(op), dominators(op) {} - -/// Computes the actual position to place allocs for the given value. -OpBuilder::InsertPoint BufferAssignmentPlacer::computeAllocPosition( - Value value) { - Operation* insertOp = value.getDefiningOp(); - assert(insertOp && "There is not a defining operation for the input value"); - OpBuilder opBuilder(insertOp); - return opBuilder.saveInsertionPoint(); -} - -//===----------------------------------------------------------------------===// -// FunctionAndBlockSignatureConverter -//===----------------------------------------------------------------------===// - -// Performs the actual signature rewriting step. -LogicalResult FunctionAndBlockSignatureConverter::matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const { - auto toMemrefConverter = [&](Type t) -> Type { - if (auto tensorType = t.dyn_cast()) { - return MemRefType::get(tensorType.getShape(), - tensorType.getElementType()); - } - return t; - }; - // Converting tensor-type function arguments to memref-type. - auto funcType = funcOp.getType(); - TypeConverter::SignatureConversion conversion(funcType.getNumInputs()); - for (auto argType : llvm::enumerate(funcType.getInputs())) { - conversion.addInputs(argType.index(), toMemrefConverter(argType.value())); - } - for (auto resType : funcType.getResults()) { - conversion.addInputs(toMemrefConverter(resType)); - } - rewriter.updateRootInPlace(funcOp, [&] { - funcOp.setType( - rewriter.getFunctionType(conversion.getConvertedTypes(), llvm::None)); - rewriter.applySignatureConversion(&funcOp.getBody(), conversion); - }); - // Converting tensor-type block arugments of all blocks inside the - // function region to memref-type except for the entry block. - for (auto& block : funcOp.getBlocks()) { - if (block.isEntryBlock()) continue; - for (int i = 0, e = block.getNumArguments(); i < e; ++i) { - auto oldArg = block.getArgument(i); - auto newArg = - block.insertArgument(i, toMemrefConverter(oldArg.getType())); - oldArg.replaceAllUsesWith(newArg); - block.eraseArgument(i + 1); - } - } - return success(); -} - -/// A helper method to make the functions, whose all block argument types are -/// Memref or non-shaped type, legal. BufferAssignmentPlacer expects all -/// function and block argument types are in Memref or non-shaped type. Using -/// this helper method and additionally, FunctionAndBlockSignatureConverter as a -/// pattern conversion make sure that the type of block arguments are compatible -/// with using BufferAssignmentPlacer. 
-void FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp( - ConversionTarget& target) { - auto isLegalBlockArg = [](BlockArgument arg) -> bool { - auto type = arg.getType(); - return type.isa() || !type.isa(); - }; - target.addDynamicallyLegalOp([&](FuncOp funcOp) { - bool legality = true; - for (auto& block2 : funcOp.getBlocks()) { - legality &= llvm::all_of(block2.getArguments(), isLegalBlockArg); - if (!legality) break; - } - return legality; - }); -} - -//===----------------------------------------------------------------------===// -// Buffer assignment pass registrations -//===----------------------------------------------------------------------===// - -std::unique_ptr> createBufferAssignmentPass() { - return absl::make_unique(); -} - -static PassRegistration buffer_assignment_pass( - "buffer-assignment", - "Executes buffer assignment pass to automatically move alloc and dealloc " - "operations into their proper positions"); - -} // namespace xla -} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h deleted file mode 100644 index ced5769b44c..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ -#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ - -#include "mlir/Analysis/Liveness.h" -#include "mlir/IR/Builders.h" // TF:llvm-project -#include "mlir/IR/Dominance.h" -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project - -namespace mlir { -namespace xla { - -/// Prepares a buffer assignment phase. It can place (user-defined) alloc -/// nodes. This simplifies the integration of the actual buffer-assignment -/// pass. Sample usage: -/// BufferAssignmentPlacer baHelper(regionOp); -/// -> determine alloc positions -/// auto allocPosition = baHelper.computeAllocPosition(value); -/// -> place alloc -/// allocBuilder.setInsertionPoint(positions.getAllocPosition()); -/// -/// alternatively: -/// -> place alloc -/// baHelper.insertAlloc(...); -/// Note: this class is intended to be used during legalization. In order -/// to move alloc and dealloc nodes into the right places you can use the -/// createBufferAssignmentPass() function. -class BufferAssignmentPlacer { - public: - /// Creates a new assignment builder. - explicit BufferAssignmentPlacer(Operation* op); - - /// Returns the operation this analysis was constructed from. - Operation* getOperation() const { return operation; } - - /// Computes the actual position to place allocs for the given value. 
- OpBuilder::InsertPoint computeAllocPosition(Value value); - - private: - /// The operation this analysis was constructed from. - Operation* operation; - - /// The dominator analysis to place allocs in the appropriate blocks. - DominanceInfo dominators; -}; - -/// Helper conversion pattern that encapsulates a BufferAssignmentPlacer -/// instance. -template -class BufferAssignmentOpConversionPattern - : public OpConversionPattern { - public: - explicit BufferAssignmentOpConversionPattern( - MLIRContext* context_, - xla::BufferAssignmentPlacer* bufferAssignment_ = nullptr, - PatternBenefit benefit_ = 1) - : OpConversionPattern(context_, benefit_), - bufferAssignment(bufferAssignment_) {} - - protected: - xla::BufferAssignmentPlacer* bufferAssignment; -}; - -// Converts only the tensor-type function and block arguments to memref-type. -class FunctionAndBlockSignatureConverter - : public BufferAssignmentOpConversionPattern { - public: - using BufferAssignmentOpConversionPattern< - FuncOp>::BufferAssignmentOpConversionPattern; - - // Adding functions whose arguments are memref type to the set of legal - // operations. - static void addDynamicallyLegalFuncOp(ConversionTarget& target); - - // Performs the actual signature rewriting step. - LogicalResult matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final; -}; - -// This pattern converter transforms a non-void ReturnOpSourceTy into a void -// return of type ReturnOpTargetTy. It uses a copy operation of type CopyOpTy to -// copy the results to the output buffer. -template -class NonVoidToVoidReturnOpConverter - : public BufferAssignmentOpConversionPattern { - public: - using BufferAssignmentOpConversionPattern< - ReturnOpSourceTy>::BufferAssignmentOpConversionPattern; - - // Performs the actual return-op conversion step. - LogicalResult matchAndRewrite( - ReturnOpSourceTy returnOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - auto numReturnValues = returnOp.getNumOperands(); - auto funcOp = returnOp.template getParentOfType(); - auto numFuncArgs = funcOp.getNumArguments(); - auto loc = returnOp.getLoc(); - - // Find the corresponding output buffer for each operand. - for (auto operand : llvm::enumerate(operands)) { - auto returnArgNumber = numFuncArgs - numReturnValues + operand.index(); - auto dstBuffer = funcOp.getArgument(returnArgNumber); - if (dstBuffer == operand.value()) { - continue; - } - - // Insert the copy operation to copy before the return. - rewriter.setInsertionPoint( - returnOp.getOperation()->getBlock()->getTerminator()); - rewriter.create(loc, operand.value(), - funcOp.getArgument(returnArgNumber)); - } - // Insert the new target return operation. - rewriter.replaceOpWithNewOp(returnOp); - return success(); - } -}; - -} // namespace xla -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc deleted file mode 100644 index 5a0d791079c..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements logic for testing buffer assignment including its -// utility converters. - -#include "tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project -#include "mlir/IR/Function.h" // TF:llvm-project -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Pass/Pass.h" // TF:llvm-project -#include "mlir/Pass/PassManager.h" // TF:llvm-project -#include "absl/memory/memory.h" -#include "tensorflow/compiler/mlir/xla/transforms/passes.h" - -namespace mlir { -namespace xla { -namespace { -/// This pass tests two provided operation converters, -/// FunctionAndBlockSignatureConverter and NonVoidToVoidReturnOpConverter, for -/// Buffer Assignment. -struct BufferAssignmentPreparationTestPass - : mlir::PassWrapper { - /// This dialect independent unary operation has been defined only for testing - /// buffer assignment. - class BufferAssignmentTestUnaryOp - : public Op { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary"; - } - static void build(OpBuilder& b, OperationState& state, Value source) { - state.addOperands(source); - } - }; - - /// This dialect independent lowered unary operation has been defined only for - /// testing buffer assignment. - class BufferAssignmentTestUnaryLoweredOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary_lowered"; - } - static void build(OpBuilder& b, OperationState& state, Value source, - Value target) { - state.addOperands(source); - state.addOperands(target); - } - }; - - /// This dialect independent copy operation has been defined only for testing - /// NonVoidToVoidReturnOpConverter - class BufferAssignmentTestCopyOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.copy"; - } - static void build(OpBuilder& b, OperationState& state, Value from, - Value to) { - state.addOperands(from); - state.addOperands(to); - } - }; - - /// A simple converter that legalizes a BufferAssignmentTestUnaryOp to a - /// BufferAssignmentTestUnaryLoweredOp and creates buffer allocation for - /// the result of the computation. - class TestUnaryOpConverter : public BufferAssignmentOpConversionPattern< - BufferAssignmentTestUnaryOp> { - public: - using BufferAssignmentOpConversionPattern< - BufferAssignmentTestUnaryOp>::BufferAssignmentOpConversionPattern; - - // Performs the actual legalization conversion step. - LogicalResult matchAndRewrite( - BufferAssignmentTestUnaryOp op, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - // Create a new buffer allocation using the current BufferAssignmentPlacer - // instance. 
- auto result = op.getResult(); - auto result_type = result.getType().dyn_cast(); - auto memref_type = - MemRefType::get(result_type.getShape(), result_type.getElementType()); - rewriter.restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); - auto alloc = rewriter.create(op.getLoc(), memref_type); - - // Create the lowered operation and replace the old operation with a - // reference to the allocated buffer. - rewriter.create(op.getLoc(), - operands[0], alloc); - rewriter.replaceOp(op, {alloc}); - return success(); - } - }; - - void runOnFunction() override { - OwningRewritePatternList patterns; - auto funcOp = getOperation(); - auto context = funcOp.getContext(); - ConversionTarget target(*context); - BufferAssignmentPlacer bufferAssignmentPlacer(funcOp); - - // Specifying the legal and illegal operations. - context->allowUnregisteredDialects(true); - target.addIllegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - // TODO(dfki): ReturnOp can also be changed to TestReturnOp like - // BufferAssignmentTestCopyOp. - target.addDynamicallyLegalOp( - [](ReturnOp returnOp) { return returnOp.getNumOperands() == 0; }); - FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp(target); - - // Adding patterns for testing this pass. - // clang-format off - patterns.insert< - FunctionAndBlockSignatureConverter, - TestUnaryOpConverter, - NonVoidToVoidReturnOpConverter - - >(context, &bufferAssignmentPlacer); - // clang-format on - - if (failed(applyPartialConversion(funcOp, target, patterns, nullptr))) { - funcOp.emitOpError() - << "Failed to apply buffer assignment preparation steps"; - } - }; -}; -} // namespace - -/// This pass tests helper methods such as computeAllocPosition, -/// FunctionAndBlockSignatureConverter, NonVoidToVoidReturnOpConverter -/// conversion patterns. Furthermore, it checks buffer-assignment pass that -/// moves existing Alloc and Dealloc operations to their proper positions, and -/// insert missing Dealloc operations. -static PassPipelineRegistration<> buffer_assignment_test_pass( - "test-buffer-assignment", - "Tests buffer assignment helper methods and buffer assignment pass.", - [](mlir::OpPassManager& pm) { - pm.addPass(absl::make_unique()); - pm.addPass(createBufferAssignmentPass()); - }); - -} // namespace xla -} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc index a20511a95fc..e5a79616d5b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -33,24 +33,23 @@ namespace { // Converts binary ops that statically are determined to not broadcast directly // to the corresponding xla_hlo non-broadcasting op. template -struct ConvertTrivialNonBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { +struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only rewrite for statically determinable non-broadcasting cases. 
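In shape terms, the guard below requires both operands to be ranked, of equal rank, fully static, and identical in every extent, so broadcasting is provably a no-op. A standalone restatement using only the standard library; the function name and the use of -1 for a dynamic extent are assumptions of this sketch:

#include <cstdint>
#include <vector>

constexpr int64_t kDynamicDim = -1;  // stand-in for a dynamic extent

// True iff the two shapes are statically known to be identical, so an
// elementwise op on them needs no broadcast at all.
static bool TriviallyNonBroadcasting(const std::vector<int64_t>& lhs,
                                     const std::vector<int64_t>& rhs) {
  if (lhs.size() != rhs.size()) return false;  // rank mismatch
  for (size_t i = 0; i < lhs.size(); ++i) {
    if (lhs[i] == kDynamicDim || rhs[i] == kDynamicDim) return false;
    if (lhs[i] != rhs[i]) return false;  // differing static extent
  }
  return true;
}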
- auto lhs = operands[0].getType().dyn_cast(); - auto rhs = operands[1].getType().dyn_cast(); - if (!lhs || !rhs) return failure(); + auto lhs_type = op.lhs().getType().template dyn_cast(); + auto rhs_type = op.rhs().getType().template dyn_cast(); + if (!lhs_type || !rhs_type) return failure(); // Requires rank broadcast. - if (lhs.getRank() != rhs.getRank()) return failure(); + if (lhs_type.getRank() != rhs_type.getRank()) return failure(); // Any dynamic dimension may require broadcasting and requires more // analysis. - if (!lhs.hasStaticShape() || !rhs.hasStaticShape()) return failure(); + if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) + return failure(); - for (auto extents : llvm::zip(lhs.getShape(), rhs.getShape())) { + for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) { auto lhs_extent = std::get<0>(extents); auto rhs_extent = std::get<1>(extents); if (lhs_extent != rhs_extent) { @@ -58,9 +57,8 @@ struct ConvertTrivialNonBroadcastBinaryOp } } - rewriter.replaceOp( - op, {Adaptor::CreateOp(op, op.getResult().getType(), operands[0], - operands[1], rewriter)}); + rewriter.replaceOp(op, {Adaptor::CreateOp(op, op.getResult().getType(), + op.lhs(), op.rhs(), rewriter)}); return success(); } }; @@ -83,14 +81,13 @@ struct ConvertTrivialNonBroadcastBinaryOp // Whether that is of any practical benefit remains to be seen. template struct ConvertRankedDynamicBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only support ranked operands. - Value lhs = operands[0]; - Value rhs = operands[1]; + Value lhs = op.lhs(); + Value rhs = op.rhs(); auto lhs_type = lhs.getType().dyn_cast(); auto rhs_type = rhs.getType().dyn_cast(); auto result_type = @@ -166,8 +163,7 @@ struct HloBinaryElementwiseAdaptor { Value broadcasted_lhs, Value broadcasted_rhs, OpBuilder &builder) { return builder.create(from_op.getLoc(), result_type, - broadcasted_lhs, broadcasted_rhs, - /*broadcast_dimensions=*/nullptr); + broadcasted_lhs, broadcasted_rhs); } }; @@ -186,9 +182,9 @@ struct HloCompareAdaptor { Type result_type, Value broadcasted_lhs, Value broadcasted_rhs, OpBuilder &builder) { - return builder.create( - from_op.getLoc(), result_type, broadcasted_lhs, broadcasted_rhs, - /*broadcast_dimensions=*/nullptr, from_op.comparison_direction()); + return builder.create(from_op.getLoc(), result_type, + broadcasted_lhs, broadcasted_rhs, + from_op.comparison_direction()); } }; diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index aa29241048b..df92681cd97 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" @@ -39,16 +40,11 @@ namespace xla_hlo { namespace { constexpr StringRef kTempBufferAttr = "temp"; - -/// Returns DeallocOp to ensure that CopyOp is not inserted after dealloc. -Operation* FindInsertionPointForCopy(Value value) { - for (const auto& user : value.getUsers()) { - if (auto dealloc = dyn_cast(user)) { - return user; - } - } - return nullptr; -} +template +using BaseOpConversion = BufferAssignmentOpConversionPattern; +using StdReturnOpConverter = + BufferAssignmentReturnOpConverter; Value InsertDynamicAllocAndDealloc(Location loc, Value result, Value shape_operand, @@ -92,8 +88,9 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, return alloc; } -Value InsertAllocAndDealloc(Location loc, Value result, - ConversionPatternRewriter* rewriter) { +Value InsertAlloc(Location loc, OpResult result, + BufferAssignmentPlacer* bufferAssignment, + ConversionPatternRewriter* rewriter) { auto result_type = result.getType().dyn_cast(); if (!result_type || !result_type.hasStaticShape()) { result.getDefiningOp()->emitOpError() @@ -101,31 +98,21 @@ Value InsertAllocAndDealloc(Location loc, Value result, } auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); - - Operation* op = result.getDefiningOp(); - auto block = op->getBlock(); - - OpBuilder allocBuilder(op); - allocBuilder.setInsertionPointToStart(block); // Inserting at the beginning - auto alloc = allocBuilder.create(loc, memref_type); - - alloc.setAttr(kTempBufferAttr, rewriter->getBoolAttr(true)); - - allocBuilder.setInsertionPoint(block, std::prev(block->end())); - allocBuilder.create(loc, alloc); - + OpBuilder::InsertionGuard guard(*rewriter); + rewriter->restoreInsertionPoint( + bufferAssignment->computeAllocPosition(result)); + auto alloc = rewriter->create(loc, memref_type); return alloc; } template -class HloToLhloOpConverter : public ConversionPattern { +class HloToLhloOpConverter : public BaseOpConversion { public: - explicit HloToLhloOpConverter(MLIRContext* context) - : ConversionPattern(HloOpTy::getOperationName(), 1, context) {} - + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + HloOpTy hloOp, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { + Operation* op = hloOp.getOperation(); const auto& original_results = op->getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : llvm::enumerate(original_results)) { @@ -135,8 +122,8 @@ class HloToLhloOpConverter : public ConversionPattern { return failure(); } if (resultType.hasStaticShape()) { - buffer_args.push_back( - InsertAllocAndDealloc(op->getLoc(), result.value(), &rewriter)); + buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(), + this->bufferAssignment, &rewriter)); } else { SmallVector results_shape; auto shape_type_op = dyn_cast(op); @@ -156,9 +143,9 @@ class HloToLhloOpConverter : public ConversionPattern { }; struct HloToLhloDynamicBroadcastInDimOpConverter - : public OpConversionPattern { + : public BaseOpConversion { public: - using OpConversionPattern::OpConversionPattern; + using 
BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::DynamicBroadcastInDimOp op, ArrayRef operands, @@ -175,10 +162,9 @@ struct HloToLhloDynamicBroadcastInDimOpConverter } }; -struct HloToLhloReduceOpConverter - : public OpConversionPattern { +struct HloToLhloReduceOpConverter : public BaseOpConversion { public: - using OpConversionPattern::OpConversionPattern; + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::ReduceOp op, ArrayRef operands, @@ -194,7 +180,8 @@ struct HloToLhloReduceOpConverter const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : original_results) { - buffer_args.push_back(InsertAllocAndDealloc(loc, result, &rewriter)); + buffer_args.push_back( + InsertAlloc(loc, result, this->bufferAssignment, &rewriter)); } auto new_op = rewriter.create( loc, llvm::None, buffer_args, op.getAttrs()); @@ -230,12 +217,12 @@ struct HloToLhloReduceOpConverter } }; -class HloToLhloTensorLoadOpConverter : public ConversionPattern { +class HloToLhloTensorLoadOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorLoadOpConverter(MLIRContext* context) - : ConversionPattern(TensorLoadOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorLoadOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOp(op, operands); return success(); @@ -243,13 +230,13 @@ class HloToLhloTensorLoadOpConverter : public ConversionPattern { }; // TODO(b/137624192): Rewrite into a copy and elide copy if possible. -class HloToLhloTensorStoreOpConverter : public ConversionPattern { +class HloToLhloTensorStoreOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorStoreOpConverter(MLIRContext* context) - : ConversionPattern(TensorStoreOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorStoreOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOpWithNewOp( op, llvm::None, operands.front(), operands.back()); @@ -291,7 +278,6 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () // "xla_lhlo.multiply"(%0, %arg0, %arg3) : // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () -// dealloc %0 : memref<2x2xf32> // "xla_lhlo.terminator"() : () -> () // }) : () -> () // return @@ -313,14 +299,13 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // %arg1: memref<4xf32>, // %arg2: memref<4xf32>) { // %0 = alloc() : memref<4xf32> -// %1 = alloc() : memref<4xf32> + // "xla_lhlo.maximum"(%arg0, %arg1, %0) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () +// %1 = alloc() : memref<4xf32> // "xla_lhlo.add"(%arg0, %0, %1) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () // "xla_lhlo.copy"(%1, %arg2) : (memref<4xf32>, memref<4xf32>) -> () -// dealloc %0 : memref<4xf32> -// dealloc %1 : memref<4xf32> // "xla_lhlo.terminator"() : () -> () // } @@ -337,7 +322,7 @@ struct HloLegalizeToLhlo target.addIllegalOp(); target.addIllegalOp(); target.addLegalOp(); - target.addLegalOp(); + target.addLegalOp(); target.addIllegalDialect(); target.addDynamicallyLegalOp([&](FuncOp op) { auto inputs = op.getType().getInputs(); @@ -346,101 +331,25 @@ struct HloLegalizeToLhlo }); auto 
module = getOperation(); - populateHLOToLHLOConversionPattern(module.getContext(), &patterns); - - // Do partial conversion so we can have unknown ops in tests. - if (failed(applyPartialConversion(module, target, patterns, nullptr))) { - signalPassFailure(); - } + BufferAssignmentTypeConverter converter; + module.walk([&](FuncOp func) { + BufferAssignmentPlacer bufferAssignment(func); + OwningRewritePatternList patterns; + populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, + &converter, &patterns); + return WalkResult( + applyPartialConversion(func, target, patterns, &converter)); + }); } }; - -Type ConvertType(Type t) { - if (auto tensorType = t.dyn_cast()) { - return MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - } - return t; -} - } // namespace -/// Transforms FuncOp arguments and results from tensors to buffers. Tensor -/// results are converted to memrefs and appended to the argument list. -class HloToLhloFuncOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - if (funcOp.getBody().getBlocks().size() > 1) { - funcOp.emitOpError() << "tensor to buffer conversion expects a single " - "block in the region containing the operation"; - return failure(); - } - - auto funcType = funcOp.getType(); - - TypeConverter::SignatureConversion conversion(funcType.getNumInputs()); - for (auto argType : llvm::enumerate(funcType.getInputs())) { - conversion.addInputs(argType.index(), ConvertType(argType.value())); - } - for (auto resType : funcType.getResults()) { - conversion.addInputs(ConvertType(resType)); - } - rewriter.updateRootInPlace(funcOp, [&] { - funcOp.setType( - rewriter.getFunctionType(conversion.getConvertedTypes(), llvm::None)); - rewriter.applySignatureConversion(&funcOp.getBody(), conversion); - }); - return success(); - } -}; - -/// Transforms ReturnOp to LhloTerminator. CopyOp is inserted to copy each -/// result to the corresponding buffer argument. 
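The return-op rewriting described here (and its upstream replacement) rely on one convention: once tensor results are appended to the converted signature as output buffers, result i of an N-result function with num_args arguments is copied into argument num_args - N + i. A small sketch of that index mapping; the function name is illustrative:

#include <cassert>

// E.g. a function converted to (%arg0, %arg1, %out0, %out1) with two results:
// result 0 -> argument 2 (%out0), result 1 -> argument 3 (%out1).
static unsigned ResultBufferArgIndex(unsigned num_args, unsigned num_results,
                                     unsigned result_index) {
  assert(num_results <= num_args && result_index < num_results);
  return num_args - num_results + result_index;
}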
-class StdToLhloReturnOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp returnOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - auto numReturnValues = returnOp.getNumOperands(); - auto funcOp = returnOp.getParentOfType(); - auto numFuncArgs = funcOp.getNumArguments(); - auto loc = returnOp.getLoc(); - - for (auto operand : llvm::enumerate(operands)) { - auto returnArgNumber = numFuncArgs - numReturnValues + operand.index(); - auto dstBuffer = funcOp.getArgument(returnArgNumber); - if (dstBuffer == operand.value()) { - continue; - } - - auto dealloc = FindInsertionPointForCopy(operand.value()); - - if (dealloc == nullptr) { - returnOp.emitOpError() - << "Missing dealloc for operand " << operand.index(); - return failure(); - } - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(dealloc); - rewriter.create(loc, llvm::None, operand.value(), - funcOp.getArgument(returnArgNumber)); - } - rewriter.replaceOpWithNewOp(returnOp); - return success(); - } -}; - -void populateHLOToLHLOConversionPattern(MLIRContext* context, - OwningRewritePatternList* patterns) { +void populateHLOToLHLOConversionPattern( + MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, + TypeConverter* converter, OwningRewritePatternList* patterns) { // clang-format off patterns->insert< HloToLhloDynamicBroadcastInDimOpConverter, - HloToLhloFuncOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -453,6 +362,7 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -472,8 +382,9 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloReduceOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter, - StdToLhloReturnOpConverter - >(context); + FunctionAndBlockSignatureConverter, + StdReturnOpConverter + >(context, bufferAssignment, converter); // clang-format on } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc index 129a24600a2..bb1169a57d6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc @@ -61,47 +61,46 @@ LogicalResult ReplaceTerminators(Region* region, Block* target_block, return success(); } -LogicalResult LowerConditionalOp(mlir::xla_hlo::ConditionalOp conditional_op) { - Operation* op_inst = conditional_op.getOperation(); - mlir::OpBuilder builder(conditional_op); +LogicalResult LowerIfOp(mlir::xla_hlo::IfOp if_op) { + Operation* op_inst = if_op.getOperation(); + mlir::OpBuilder builder(if_op); auto orig_block = op_inst->getBlock(); auto* tail_block = orig_block->splitBlock(op_inst); - auto loc = conditional_op.getLoc(); + auto loc = if_op.getLoc(); // Duplicate the true and false regions in the block between the sections // before and after the conditional. 
BlockAndValueMapping mapper; - conditional_op.true_branch().cloneInto(orig_block->getParent(), - Region::iterator(tail_block), mapper); - conditional_op.false_branch().cloneInto(orig_block->getParent(), - Region::iterator(tail_block), mapper); + if_op.true_branch().cloneInto(orig_block->getParent(), + Region::iterator(tail_block), mapper); + if_op.false_branch().cloneInto(orig_block->getParent(), + Region::iterator(tail_block), mapper); // Determine the blocks for the start of the true and false regions. - Block* true_block = mapper.lookup(&conditional_op.true_branch().front()); - Block* false_block = mapper.lookup(&conditional_op.false_branch().front()); + Block* true_block = mapper.lookup(&if_op.true_branch().front()); + Block* false_block = mapper.lookup(&if_op.false_branch().front()); // Perform the conditional branch into the true/false cases. builder.setInsertionPointToEnd(orig_block); // Extract the predicate for checking branching, then branch to the true and // false regions appropriately. - auto cond_value = - builder.create(loc, conditional_op.pred()); + auto cond_value = builder.create(loc, if_op.pred()); builder.create(loc, cond_value, true_block, - conditional_op.true_arg(), false_block, - conditional_op.false_arg()); + if_op.true_arg(), false_block, + if_op.false_arg()); // Replace the true case's return operations with a branch to the tail of // the condition. - if (failed(ReplaceTerminators(&conditional_op.true_branch(), tail_block, loc, - mapper, &builder))) + if (failed(ReplaceTerminators(&if_op.true_branch(), tail_block, loc, mapper, + &builder))) return failure(); - if (failed(ReplaceTerminators(&conditional_op.false_branch(), tail_block, loc, - mapper, &builder))) + if (failed(ReplaceTerminators(&if_op.false_branch(), tail_block, loc, mapper, + &builder))) return failure(); - tail_block->addArguments(conditional_op.getResult().getType()); - conditional_op.getResult().replaceAllUsesWith(tail_block->getArgument(0)); + tail_block->addArguments(if_op.getResult().getType()); + if_op.getResult().replaceAllUsesWith(tail_block->getArgument(0)); op_inst->erase(); return success(); @@ -210,11 +209,11 @@ LogicalResult LowerWhileOp(mlir::xla_hlo::WhileOp while_op) { void LegalizeControlFlow::runOnFunction() { auto func = getFunction(); - llvm::SmallVector conditional_ops; - func.walk([&](ConditionalOp op) { conditional_ops.push_back(op); }); + llvm::SmallVector if_ops; + func.walk([&](IfOp op) { if_ops.push_back(op); }); - for (auto& op : conditional_ops) { - if (failed(LowerConditionalOp(op))) return signalPassFailure(); + for (auto& op : if_ops) { + if (failed(LowerIfOp(op))) return signalPassFailure(); } llvm::SmallVector while_ops; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index fb03c9b82e5..2d6da67fc60 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project @@ -43,9 +44,11 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/xla/convert_op_folder.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -64,8 +67,9 @@ class LegalizeTF : public PassWrapper { public: LegalizeTF() = default; LegalizeTF(const LegalizeTF &) {} - explicit LegalizeTF(bool allow_partial_conversion) { + explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo) { allow_partial_conversion_ = allow_partial_conversion; + legalize_chlo_ = legalize_chlo; } /// Performs the lowering to XLA dialect. @@ -76,6 +80,11 @@ class LegalizeTF : public PassWrapper { *this, "allow-partial-conversion", llvm::cl::desc("Allow operations that can't be legalized."), llvm::cl::init(false)}; + Option legalize_chlo_{ + *this, "legalize-chlo", + llvm::cl::desc( + "Also legalizes intermediate chlo ops to hlo (default true)"), + llvm::cl::init(true)}; }; /// Returns if the given TF data format string is the default format. @@ -359,6 +368,174 @@ static Value UpdateSliceInMinorDims(Location loc, Value v, Value update, return DynamicUpdateSliceInMinorDims(loc, v, update, dus_starts, builder); } +// Deprecated: This is maintained to aid in porting old code that is not yet +// dynamic shape aware and uses broadcasting modes that CHLO does not support. +// Gets the resulting type from a broadcast between two types for statically +// shaped types. This is to be used for legacy lowerings that both use non +// left-padded broadcasting and static shapes. Its use should not be permitted +// in new code. +// May return nullptr on invalid static broadcast dimensions. +// ABSL_DEPRECATED() +static RankedTensorType GetStaticBroadcastType( + RankedTensorType x, RankedTensorType y, + DenseIntElementsAttr broadcast_dimensions_attr) { + auto element_type = x.getElementType(); + auto shape_x = x.getShape(); + auto shape_y = y.getShape(); + + if (shape_x.size() == shape_y.size()) { + llvm::SmallVector out_shape(shape_x.size()); + for (int i = 0; i < shape_x.size(); i++) { + auto x_val = shape_x[i]; + auto y_val = shape_y[i]; + out_shape[i] = std::max(x_val, y_val); + } + return RankedTensorType::get(out_shape, element_type); + } + + auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; + auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; + + llvm::SmallVector broadcast_dimensions; + // Explicit broadcast dimensions. + for (const APInt &int_value : broadcast_dimensions_attr) { + broadcast_dimensions.push_back(int_value.getSExtValue()); + } + if (broadcast_dimensions.size() != shape_small.size()) { + return nullptr; + } + llvm::SmallVector out_shape(shape_large.begin(), + shape_large.end()); + + // Update according to the broadcast dimensions. 
+ for (auto index_pair : llvm::enumerate(broadcast_dimensions)) { + auto old_value = out_shape[index_pair.value()]; + auto new_value = shape_small[index_pair.index()]; + out_shape[index_pair.value()] = std::max(old_value, new_value); + } + return RankedTensorType::get(out_shape, element_type); +} + +// Deprecated: This is maintained to aid in porting old code that is not yet +// dynamic shape aware and uses broadcasting modes that CHLO does not support. +// Applies static binary broadcasting to a binary elementwise op. +// This is a legacy helper to provide general broadcasting support in legacy, +// static shaped code that relies on non-left-padded broadcasting semantics. +template +static Value StaticBinaryBroadcast(Location loc, Value x, Value y, + DenseIntElementsAttr broadcast_dims, + OpBuilder &builder) { + auto x_type = x.getType().cast(); + auto y_type = y.getType().cast(); + auto result_type = GetStaticBroadcastType(x_type, y_type, broadcast_dims); + if (!result_type) { + emitError(loc) << "could not binary broadcast " << x_type << ", " << y_type + << " with broadcast_dims = " << broadcast_dims; + return nullptr; + } + auto larger_broadcast_dims = + GetI64ElementsAttrForSeq(0, result_type.getRank(), &builder); + if (x_type.getRank() < y_type.getRank()) { + if (x_type != result_type) { + x = builder.create(loc, result_type, x, broadcast_dims); + } + if (y_type != result_type) { + y = builder.create(loc, result_type, y, + larger_broadcast_dims); + } + } else { + if (x_type != result_type) { + x = builder.create(loc, result_type, x, + larger_broadcast_dims); + } + if (y_type != result_type) { + y = builder.create(loc, result_type, y, broadcast_dims); + } + } + return builder.create(loc, x, y); +} + +// Gets a 1D tensor type suitable for expressing extents of the given tensor +// value type. If the value type is ranked, the result will be statically +// shaped. Otherwise, it will have a dynamic dimension. +static RankedTensorType GetExtentsTensorTypeFor(TensorType value_type) { + Builder b(value_type.getContext()); + int64_t dim = value_type.hasRank() ? value_type.getRank() : -1; + return RankedTensorType::get({dim}, b.getIndexType()); +} + +// Broadcasts a 'lower_rank_value' to the shape of a 'higher_rank_value' +// by assuming that the shape of the lower ranked is a broadcast compatible +// prefix of the higher ranked. +// Values must be RankedTensorType (this restriction derives from the +// broadcast_dimensions attribute on DynamicBroadcastInDim). +// +// Example: +// CommonPrefixBroadcast(tensor<4x3x256>, tensor<4, 3>) will broadcast the +// lower rank value to [4, 3, 256] (i.e. the opposite of numpy-style +// implicit broadcasting). +static Value CommonPrefixBroadcast(Location loc, Value higher_rank_value, + Value lower_rank_value, OpBuilder &builder) { + Value higher_rank_shape = + builder.create(loc, higher_rank_value); + auto result_extents_type = + GetExtentsTensorTypeFor(higher_rank_value.getType().cast()); + Value result_extents = builder.create( + loc, result_extents_type, higher_rank_shape); + + auto lower_rank_type = lower_rank_value.getType().cast(); + auto lower_rank = lower_rank_type.getRank(); + auto prefix_dims = GetI64ElementsAttrForSeq(0, lower_rank, &builder); + return builder.create( + loc, higher_rank_value.getType(), lower_rank_value, result_extents, + prefix_dims); +} + +// Given a value (broadcast_to) and a feature dimension, broadcasts a 1D +// value (broadcast_from) along that feature dimension. 
This is a shortcut +// for the cases where a 1D tensor must be broadcast along a specific feature +// dimension, which can vary based on data layout, etc. +// +// The extent of `broadcast_from` dim0 must be equal to the extent of the +// feature_dim of `broadcast_to`. +// +// Example: +// [1x2x3x4], [2], 1 -> [1x2x3x4] +// TODO(laurenzo): Swap the order of broadcast_to and broadcast_from for +// consistency. Possibly also rename for clarity. +static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, + Value broadcast_from, int64_t feature_dim, + OpBuilder &builder) { + auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &builder); + auto to_type = broadcast_to.getType().cast(); + auto result_shape = builder.create(loc, broadcast_to); + auto result_extents_type = GetExtentsTensorTypeFor(to_type); + auto result_extents = builder.create( + loc, result_extents_type, result_shape); + return builder.create( + loc, to_type, broadcast_from, result_extents, broadcast_dims); +} + +// Broadcasts `input` to the shape of `broadcast_to` value following +// TF::BroadcastTo semantics. +// +// Requires that input is a ranked tensor. +// +// TODO(hinsu): Utilize TF::ShapeOp followed by TF::BroadcastTo once ShapeOp +// supports unranked inputs in the lowering. +static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to, + OpBuilder &builder) { + auto result_shape = builder.create(loc, broadcast_to); + auto to_type = broadcast_to.getType().cast(); + auto result_extents_type = GetExtentsTensorTypeFor(to_type); + auto result_extents = builder.create( + loc, result_extents_type, result_shape); + int64_t rank = input.getType().cast().getRank(); + auto broadcast_dims = GetI64ElementsAttrForSeq(0, rank, &builder); + return builder.create( + loc, to_type, input, result_extents, broadcast_dims); +} + // Creates a batch dot using xla_hlo::DotGeneralOp. Value BatchDot(Location loc, Value lhs, bool transpose_lhs, Value rhs, bool transpose_rhs, int64_t num_batch_dims, @@ -404,8 +581,7 @@ static void BuildReduceBody(Type element_type, Region *body, Location loc = body->getLoc(); auto reducer = - builder->create(loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr); + builder->create(loc, block->getArgument(0), block->getArgument(1)); builder->create(loc, reducer.getResult()); } @@ -505,8 +681,7 @@ static void CreateWhile32(Location loc, int num_iterations, loc, builder->getI32IntegerAttr(num_iterations)); StringAttr compare_direction = StringAttr::get("LT", builder->getContext()); Value compare = builder->create( - loc, loop_iv, upper_limit, - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, loop_iv, upper_limit, compare_direction); builder->create(loc, compare); } @@ -536,9 +711,9 @@ static void CreateWhile32(Location loc, int num_iterations, // Increment the loop induction variable by one. auto one = builder->create(loc, builder->getI32IntegerAttr(1)); - auto no_broadcast_dims = GetI64ElementsAttr({}, builder); - auto plus_one = builder->create(loc, old_values[0], one, - no_broadcast_dims); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, builder); + auto plus_one = builder->create( + loc, old_values[0], one, scalar_broadcast_dims); // Prepend with the updated loop induction variable. 
new_values.insert(new_values.begin(), plus_one); @@ -563,21 +738,6 @@ static IntegerAttr getFeatureDimensionAttr(Builder &b, StringAttr format, GetFeatureDimension(format, input.getType().cast())); } -//===----------------------------------------------------------------------===// -// Bias op utilities. -//===----------------------------------------------------------------------===// - -// Return a 1D DenseIntElementsAttr for the feature dimension of a BiasAdd. -// Requires input to have ranked tensor. -static DenseIntElementsAttr getBiasFeatureDimension(Builder &b, - StringAttr format, - Value input) { - auto inputType = input.getType().cast(); - size_t featureDim = GetFeatureDimension(format, inputType); - RankedTensorType type = RankedTensorType::get(1, b.getIntegerType(64)); - return DenseIntElementsAttr::get(type, featureDim); -} - //===----------------------------------------------------------------------===// // MatMul op utilities. //===----------------------------------------------------------------------===// @@ -740,8 +900,7 @@ static void BuildArgMinMaxReductionBody(Type input_element_type, StringAttr compare_direction = StringAttr::get(direction, builder->getContext()); Value compare = builder->create( - loc, block->getArgument(0), block->getArgument(2), - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, block->getArgument(0), block->getArgument(2), compare_direction); Value selected_input = builder->create( loc, input_type, compare, block->getArgument(0), block->getArgument(2)); @@ -857,8 +1016,7 @@ static void BuildSortComparisonBody(llvm::ArrayRef element_types, StringAttr compare_direction = StringAttr::get(direction, builder->getContext()); Value compare = builder->create( - loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr, compare_direction); + loc, block->getArgument(0), block->getArgument(1), compare_direction); builder->create(loc, compare); } @@ -897,6 +1055,27 @@ NamedAttribute GetConvDimensionNumbersAttr( feature_dim, spatial_dims, builder->getContext())); } +// Converts a TF::BiasAddOp to HLO. +// This differs from a normal TF::AddOp with respect to how the data_format +// is handled, which can optionally require a general broadcast of the +// 'bias' term in a way that is not compatible with the standard left-padded +// broadcast semantics (i.e. NCHW will broadcast into dimension 1). +// The correct 'bias' broadcast will be synthesized manually. +class ConvertBiasAddOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TF::BiasAddOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto feature_dim = GetFeatureDimension( + op.data_formatAttr(), op.value().getType().cast()); + auto bias_broadcast = Broadcast1DToFeatureDim(loc, op.value(), op.bias(), + feature_dim, rewriter); + rewriter.replaceOpWithNewOp(op, op.value(), bias_broadcast); + return success(); + } +}; + // Converts the TensorFlow conv op in template to the generic HLO conv op by // converting TensorFlow op attributes to HLO op attributes. 
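The ConvertBiasAddOp pattern above hinges on choosing the feature dimension from the data format: with NHWC the 1-D bias is broadcast along the last dimension, with NCHW along dimension 1, which left-padded broadcasting cannot express directly. A shape-level sketch of that choice (standalone; the helper name is an assumption):

#include <cstdint>
#include <string>

// For a value of rank `rank`, returns the dimension the 1-D bias is broadcast
// along. E.g. value tensor<2x3x4x5> in NCHW with bias tensor<3> -> dimension 1.
static int64_t BiasFeatureDimension(const std::string& data_format,
                                    int64_t rank) {
  return data_format == "NHWC" ? rank - 1 : 1;
}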
// @@ -1158,7 +1337,6 @@ class ConvertDiagPartOp : public OpRewritePattern { rewriter.getI64IntegerAttr(1)); Value compare = rewriter.create( op.getLoc(), iota0, iota1, - /*broadcast_dimensions=*/nullptr, StringAttr::get("EQ", rewriter.getContext())); Value zero = GetScalarConstOfType(input_type.getElementType(), op.getLoc(), 0, &rewriter); @@ -1271,33 +1449,35 @@ class ConvertFusedBatchNormGradBase non_feature_dims.push_back(i); } auto reduce_dims = GetI64ElementsAttr(non_feature_dims, &rewriter); - auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &rewriter); - auto no_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); // scratch1 = rsqrt(var + epsilon) RankedTensorType scalar_float = RankedTensorType::get({}, kernel_type); auto epsilon = rewriter.create( loc, DenseFPElementsAttr::get(scalar_float, {op.epsilon()})); - auto add_op = rewriter.create(loc, var, epsilon.getResult(), - no_broadcast_dims); + auto add_op = rewriter.create( + loc, var, epsilon.getResult(), scalar_broadcast_dims); + Value scratch1 = rewriter.create(loc, add_op); // scratch2 = sum(y_backprop * (x - mean)) - auto sub_op = rewriter.create(loc, act, mean, broadcast_dims); - auto weighted_grad = - rewriter.create(loc, grad, sub_op, no_broadcast_dims); + auto sub_op = rewriter.create( + loc, act, + Broadcast1DToFeatureDim(loc, act, mean, feature_dim, rewriter)); + auto weighted_grad = rewriter.create(loc, grad, sub_op); Value scratch2 = ApplyReduction(loc, weighted_grad, reduce_dims, &rewriter); // x_backprop = y_backprop * (scale * scratch1) auto scaled_grad = - rewriter.create(loc, op.scale(), scratch1, no_broadcast_dims); - x_backprop = - rewriter.create(loc, grad, scaled_grad, broadcast_dims); + rewriter.create(loc, op.scale(), scratch1); + x_backprop = rewriter.create( + loc, grad, + Broadcast1DToFeatureDim(loc, act, scaled_grad, feature_dim, + rewriter)); // scale_backprop = scratch2 * scratch1 - scale_backprop = - rewriter.create(loc, scratch1, scratch2, no_broadcast_dims); + scale_backprop = rewriter.create(loc, scratch1, scratch2); // offset_backprop = sum(y_backprop) offset_backprop = ApplyReduction(loc, grad, reduce_dims, &rewriter); @@ -1393,7 +1573,7 @@ class ConvertFusedBatchNormV3Op auto factor_const_op = rewriter.create( op.getLoc(), rewriter.getFloatAttr(scale_element_type, factor)); - Value corrected_variance = rewriter.create( + Value corrected_variance = rewriter.create( op.getLoc(), batch_variance.getType(), batch_variance, factor_const_op, /*broadcast_dimensions=*/DenseIntElementsAttr()); @@ -1413,24 +1593,26 @@ class ConvertFusedBatchNormV3Op rewriter.getFloatAttr(mean_element_type, exponential_avg_factor)); // new_running_mean = alpha * old_mean + beta * batch_mean. - auto alpha_mul_old_mean = rewriter.create( + auto alpha_mul_old_mean = rewriter.create( op.getLoc(), op.mean().getType(), alpha, op.mean(), /*broadcast_dimensions=*/DenseIntElementsAttr()); - auto beta_mul_batch_mean = rewriter.create( + auto beta_mul_batch_mean = rewriter.create( op.getLoc(), batch_mean.getType(), beta, batch_mean, /*broadcast_dimensions=*/DenseIntElementsAttr()); - batch_mean = rewriter.create( + batch_mean = rewriter.create( op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean, /*broadcast_dimensions=*/DenseIntElementsAttr()); // new_running_variance = alpha * old_variance + beta * batch_variance. 
- auto alpha_mul_old_variance = rewriter.create( + auto alpha_mul_old_variance = rewriter.create( op.getLoc(), op.variance().getType(), alpha, op.variance(), /*broadcast_dimensions=*/DenseIntElementsAttr()); - auto beta_mul_batch_variance = rewriter.create( - op.getLoc(), corrected_variance.getType(), beta, corrected_variance, - /*broadcast_dimensions=*/DenseIntElementsAttr()); - corrected_variance = rewriter.create( + auto beta_mul_batch_variance = + rewriter.create( + op.getLoc(), corrected_variance.getType(), beta, + corrected_variance, + /*broadcast_dimensions=*/DenseIntElementsAttr()); + corrected_variance = rewriter.create( op.getLoc(), alpha_mul_old_variance, beta_mul_batch_variance, /*broadcast_dimensions=*/DenseIntElementsAttr()); } @@ -1583,10 +1765,9 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Divide by the number of elements in the window. Value divisor = GetScalarConstOfType(sum_element_type, op.getLoc(), count, &rewriter); - auto batch_dims = - GetI64ElementsAttrForSeq(0, input_type.getRank(), &rewriter); - Value result = rewriter.create(op.getLoc(), result_type, reduce, - divisor, batch_dims); + auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + Value result = rewriter.create( + op.getLoc(), result_type, reduce, divisor, scalar_broadcast_dims); // Convert back if we enlarged the element type's bitwidth. if (input_element_type != sum_element_type) @@ -1743,29 +1924,20 @@ class ConvertSigmoidOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::SigmoidOp op, PatternRewriter &rewriter) const override { - auto operand = op.getOperand(); + Location loc = op.getLoc(); - auto scalar_one = rewriter.create( - op.getLoc(), - rewriter.getFloatAttr(getElementTypeOrSelf(operand.getType()), 0.5)); + // Create constant half with shape and element type same as the operand. + Value operand = op.getOperand(); + auto operand_ty = operand.getType().cast(); + auto scalar_ty = RankedTensorType::get({}, operand_ty.getElementType()); + ElementsAttr attr = mlir::xla::getSplat(&rewriter, scalar_ty, 0.5); + auto scalar_half = rewriter.create(loc, attr); + auto half = BroadcastToShapeOf(loc, scalar_half, operand, rewriter); - auto type = operand.getType().dyn_cast(); - if (!type) - return rewriter.notifyMatchFailure(op, "requires ranked tensor type"); - auto constant_ones = rewriter.create( - op.getLoc(), type, scalar_one, - GetI64ElementsAttr(type.getShape(), &rewriter)); - - auto scaled_input = rewriter.create( - op.getLoc(), operand, constant_ones, DenseIntElementsAttr()); - auto tanh_op = - rewriter.create(op.getLoc(), operand.getType(), scaled_input); - auto mul_op = - rewriter.create(op.getLoc(), tanh_op, constant_ones, - /*DenseIntElementsAttr=*/DenseIntElementsAttr()); - auto add_op = - rewriter.create(op.getLoc(), mul_op, constant_ones, - /*DenseIntElementsAttr=*/DenseIntElementsAttr()); + auto scaled_input = rewriter.create(loc, operand, half); + auto tanh_op = rewriter.create(loc, scaled_input); + auto mul_op = rewriter.create(loc, tanh_op, half); + auto add_op = rewriter.create(loc, mul_op, half); rewriter.replaceOp(op, add_op.getResult()); return success(); @@ -1804,20 +1976,18 @@ class ConvertSoftmaxOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - Value logits = op.logits(); - // Softmax converter requires ranked type because the XLA reduce ops used // while lowering requires dimensions attribute to reduce along. 
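The ConvertSigmoidOp rewrite above emits a broadcasted 0.5 constant, a multiply, a tanh, another multiply, and an add, relying on the identity sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5. A quick standalone numeric check of that identity:

#include <cmath>
#include <cstdio>

int main() {
  // sigmoid(x) = 1 / (1 + e^-x) = 0.5 * tanh(0.5 * x) + 0.5
  for (double x : {-4.0, -1.0, 0.0, 0.5, 3.0}) {
    const double direct = 1.0 / (1.0 + std::exp(-x));
    const double via_tanh = 0.5 * std::tanh(0.5 * x) + 0.5;
    std::printf("x=%+.1f  direct=%.12f  via_tanh=%.12f\n", x, direct, via_tanh);
  }
  return 0;
}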
+ // Note that the input and output shape is equivalent, so we use 'logits' + // and its type for shape calculations. + Value logits = op.logits(); RankedTensorType type = logits.getType().dyn_cast(); if (!type) return failure(); - auto loc = op.getLoc(); int rank = type.getRank(); // Note that the TensorFlow Softmax op verifies that the input rank is - // greater than or equal to one so both of the following sequences are - // valid. - auto batch_dims = GetI64ElementsAttrForSeq(0, rank - 1, &rewriter); + // greater than or equal to one so the following sequence is valid. auto reduce_dim = rewriter.create( loc, GetI64ElementsAttr({rank - 1}, &rewriter)); @@ -1830,8 +2000,10 @@ class ConvertSoftmaxOp : public OpRewritePattern { auto max_logits = rewriter.create(loc, logits, reduce_dim, /*keep_dims=*/rewriter.getBoolAttr(false)); - auto shifted_logits = - rewriter.create(loc, type, logits, max_logits, batch_dims); + auto max_logits_broadcast = + CommonPrefixBroadcast(loc, logits, max_logits, rewriter); + auto shifted_logits = rewriter.create(loc, type, logits, + max_logits_broadcast); // Exponentiate the inputs. Value exp = rewriter.create(loc, type, shifted_logits); @@ -1844,9 +2016,12 @@ class ConvertSoftmaxOp : public OpRewritePattern { if (use_log) { Value log = rewriter.create(loc, sum); - rewriter.replaceOpWithNewOp(op, shifted_logits, log, batch_dims); + auto log_broadcast = CommonPrefixBroadcast(loc, logits, log, rewriter); + rewriter.replaceOpWithNewOp(op, shifted_logits, + log_broadcast); } else { - rewriter.replaceOpWithNewOp(op, exp, sum, batch_dims); + auto sum_broadcast = CommonPrefixBroadcast(loc, logits, sum, rewriter); + rewriter.replaceOpWithNewOp(op, exp, sum_broadcast); } return success(); } @@ -1893,7 +2068,7 @@ class ConvertSizeOp : public OpRewritePattern { auto dim = rewriter.create( op.getLoc(), result_type, input, rewriter.getIntegerAttr(rewriter.getIntegerType(32), i)); - size = rewriter.create( + size = rewriter.create( op.getLoc(), size->getResult(0), dim.getResult(), /*DenseIntElementsAttr=*/DenseIntElementsAttr()); } @@ -2579,16 +2754,31 @@ class ConvertRangeOp : public OpRewritePattern { auto iota = rewriter.create(op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( + auto scaled = rewriter.create( op.getLoc(), result_type, iota, op.delta(), xla::getBroadcastDimensionsAttr(&rewriter, iota, op.delta())); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, result_type, scaled, op.start(), xla::getBroadcastDimensionsAttr(&rewriter, scaled, op.start())); return success(); } }; +ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { + auto int_attr = attr.cast(); + auto type = val.getType().cast(); + + SmallVector axis; + axis.reserve(int_attr.getNumElements()); + + int64_t rank = type.getRank(); + for (auto val : int_attr.getValues()) { + axis.push_back((val.getSExtValue() + rank) % rank); + } + + return builder->getI64TensorAttr(axis); +} + /// Converts the LinSpace tensorflow op to a xla_hlo.iota op with a scaling /// and offset applied to generate the linspace values. The output tensor needs /// to have a static shape. The implementation is defined in C++ because there @@ -2615,7 +2805,7 @@ class ConvertLinSpaceOp : public OpRewritePattern { int64_t num = (*num_attr.begin()).getSExtValue(); // Calculate the scaling that needs to be applied to the iota. 
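The LinSpace lowering below realizes the values as start + iota * step, where step = (stop - start) / (num - 1) for num > 1 and the denominator stays at 1 when num == 1. A plain reference computation (sketch only, names assumed):

#include <cstdio>
#include <vector>

static std::vector<double> LinSpaceReference(double start, double stop,
                                             int num) {
  const double denom = num > 1 ? num - 1 : 1;
  const double step = (stop - start) / denom;
  std::vector<double> out;
  for (int i = 0; i < num; ++i) out.push_back(start + i * step);
  return out;
}

int main() {
  for (double v : LinSpaceReference(0.0, 1.0, 5)) std::printf("%g ", v);
  std::printf("\n");  // prints: 0 0.25 0.5 0.75 1
}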
- auto step_numerator = rewriter.create( + auto step_numerator = rewriter.create( op.getLoc(), op.start().getType(), op.stop(), op.start(), xla::getBroadcastDimensionsAttr(&rewriter, op.stop(), op.start())); Value step_denominator = rewriter.create( @@ -2623,11 +2813,11 @@ class ConvertLinSpaceOp : public OpRewritePattern { if (num > 1) { Value one = GetScalarConstOfType(result_type.getElementType(), op.getLoc(), 1, &rewriter); - step_denominator = rewriter.create( + step_denominator = rewriter.create( op.getLoc(), step_denominator.getType(), step_denominator, one, xla::getBroadcastDimensionsAttr(&rewriter, step_denominator, one)); } - auto step = rewriter.create( + auto step = rewriter.create( op.getLoc(), step_numerator.getType(), step_numerator, step_denominator, xla::getBroadcastDimensionsAttr(&rewriter, step_numerator, step_denominator)); @@ -2635,10 +2825,10 @@ class ConvertLinSpaceOp : public OpRewritePattern { // Scale the iota and add the offset. auto iota = rewriter.create(op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( + auto scaled = rewriter.create( op.getLoc(), result_type, iota, step, xla::getBroadcastDimensionsAttr(&rewriter, iota, step)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, result_type, scaled, op.start(), xla::getBroadcastDimensionsAttr(&rewriter, scaled, op.start())); return success(); @@ -2714,8 +2904,8 @@ class GenericConvertReductionOp : public OpRewritePattern { auto divisor = GetScalarConstOfType(reduce_element_type, loc, divisor_count, &rewriter); auto broadcast_dims = GetI64ElementsAttr({}, &rewriter); - result = rewriter.create(loc, result, divisor.getResult(), - broadcast_dims); + result = rewriter.create( + loc, result, divisor.getResult(), broadcast_dims); } result = rewriter.create(loc, result, element_type); @@ -3100,7 +3290,6 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { auto reducer = rewriter.create( loc, block->getArgument(0), block->getArgument(1), - /*broadcast_dimensions=*/nullptr, StringAttr::get("GE", rewriter.getContext())); rewriter.create(loc, reducer.getResult()); } @@ -3526,13 +3715,20 @@ class ConvertOneHotOp : public OpRewritePattern { output_dims.insert(output_dims.begin() + axis, depth); Location loc = op.getLoc(); + + // The iota result is the effective output shape of the computation, + // and indices must be broadcast into it. At this point, this computation + // would need to be reworked quite a bit to support dynamic shapes, so + // just using static broadcasting. auto index_type = RankedTensorType::get(output_dims, element_type); - Value compare = rewriter.create( - loc, op.indices(), - rewriter.create( - loc, index_type, - IntegerAttr::get(rewriter.getIntegerType(64), axis)), - GetI64ElementsAttr(broadcast_dims, &rewriter), + auto iota = rewriter.create( + loc, index_type, IntegerAttr::get(rewriter.getIntegerType(64), axis)); + auto broadcast_indices = rewriter.create( + loc, index_type, op.indices(), + GetI64ElementsAttr(broadcast_dims, &rewriter)); + + Value compare = rewriter.create( + loc, broadcast_indices, iota, StringAttr::get("EQ", rewriter.getContext())); Value on_value = rewriter.create( loc, op.getType(), op.on_value(), @@ -4181,6 +4377,68 @@ class ConvertXlaShardingOp : public OpRewritePattern { } }; +// Converts a TF InplaceUpdate op to DynamicUpdateSlice HLO. 
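The converter that follows unpacks the index vector i and splits the update rows of v, then chains one DynamicUpdateSlice per row into x. Its intended effect is x[i[k], ...] = v[k, ...] for every k; because the slices are applied sequentially, a repeated index keeps the last update. A reference version over 2-D data (standalone sketch):

#include <cstdint>
#include <vector>

// x: [rows][cols], indices: [k], updates: [k][cols].
static void InplaceUpdateRows(std::vector<std::vector<float>>& x,
                              const std::vector<int64_t>& indices,
                              const std::vector<std::vector<float>>& updates) {
  for (size_t k = 0; k < indices.size(); ++k) x[indices[k]] = updates[k];
}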
+class ConvertInplaceUpdateOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::InplaceUpdateOp op, + PatternRewriter &rewriter) const override { + auto input = op.x(); + auto indices = op.i(); + auto updates = op.v(); + + // Slice each row of `i` and `v` to perform a separate dynamic-update-slice + // on the contents of `x`. + auto input_type = input.getType().cast(); + auto updates_type = updates.getType().cast(); + auto indices_type = indices.getType().cast(); + if (!indices_type.hasStaticShape()) return failure(); + + if (indices_type.getRank() != 1) return failure(); + + SmallVector unpacked_indices_type( + indices_type.getDimSize(0), + RankedTensorType::get({}, indices_type.getElementType())); + auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(64), 0); + auto unpacked_indices = rewriter.create( + op.getLoc(), unpacked_indices_type, indices, zero_attr); + + SmallVector split_updates_shape; + split_updates_shape.append(updates_type.getShape().begin(), + updates_type.getShape().end()); + split_updates_shape.front() = 1; + SmallVector split_updates_type; + split_updates_type.resize( + updates_type.getShape().front(), + RankedTensorType::get(split_updates_shape, + updates_type.getElementType())); + + auto cst = + rewriter.create(op.getLoc(), zero_attr).getResult(); + auto split_updates = rewriter.create( + op.getLoc(), split_updates_type, cst, updates); + + SmallVector input_indices; + input_indices.resize(input_type.getRank(), cst); + + SmallVector starts(updates_type.getRank(), 0); + SmallVector strides(updates_type.getRank(), 1); + SmallVector limits(updates_type.getShape().begin(), + updates_type.getShape().end()); + + for (auto pair : + llvm::zip(unpacked_indices.output(), split_updates.output())) { + input_indices.front() = std::get<0>(pair); + input = rewriter.create( + op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices); + } + + rewriter.replaceOp(op, input); + return success(); + } +}; + // Converts a TF XlaDynamicUpdateSlice op to DynamicUpdateSlice HLO. class ConvertXlaDynamicUpdateSliceOp : public OpRewritePattern { @@ -4316,7 +4574,6 @@ class ConvertQrOp : public OpRewritePattern { rewriter.getI64IntegerAttr(1)); Value compare = rewriter.create( op.getLoc(), iota0, iota1, - /*broadcast_dimensions=*/nullptr, StringAttr::get("EQ", rewriter.getContext())); Value identity_matrix = rewriter.create(op.getLoc(), compare, type.getElementType()); @@ -4350,8 +4607,7 @@ class ConvertQrOp : public OpRewritePattern { batch_dims.size(), precision_config, &rewriter); a_update = BatchDot(op.getLoc(), y, false, a_update, false, batch_dims.size(), precision_config, &rewriter); - a_panel = rewriter.create(op.getLoc(), a_panel, a_update, - /*broadcast_dimensions=*/nullptr); + a_panel = rewriter.create(op.getLoc(), a_panel, a_update); a = UpdateSliceInMinorDims(op.getLoc(), a, a_panel, {i, i + k}, &rewriter); @@ -4362,8 +4618,7 @@ class ConvertQrOp : public OpRewritePattern { batch_dims.size(), precision_config, &rewriter); q_update = BatchDot(op.getLoc(), q_update, false, y, true, batch_dims.size(), precision_config, &rewriter); - q_panel = rewriter.create(op.getLoc(), q_panel, q_update, - /*broadcast_dimensions=*/nullptr); + q_panel = rewriter.create(op.getLoc(), q_panel, q_update); q = UpdateSliceInMinorDims(op.getLoc(), q, q_panel, {i}, &rewriter); } // full_matrices is false when only a partial result in needed. 
Slice to the @@ -4425,34 +4680,31 @@ class ConvertQrOp : public OpRewritePattern { Value iota = builder->create( loc, RankedTensorType::get({m}, builder->getIntegerType(32)), builder->getI64IntegerAttr(0)); - Value gtk = builder->create( + Value gtk = builder->create( loc, iota, k, GetI64ElementsAttr({}, builder), StringAttr::get("GT", builder->getContext())); gtk = builder->create(loc, gtk, x_type.getElementType()); - Value x_after_k = builder->create( + Value x_after_k = builder->create( loc, x, gtk, GetI64ElementsAttr({minor_dim}, builder)); - Value x_after_k_sq = builder->create( - loc, x_after_k, x_after_k, /*broadcast_dimensions=*/nullptr); + Value x_after_k_sq = builder->create(loc, x_after_k, x_after_k); // sigma = np.dot(x[k+1:], x[k+1:]) auto sigma = builder->create( loc, x_after_k_sq, zero, GetI64ElementsAttr({minor_dim}, builder)); BuildReduceBody(x_type.getElementType(), &sigma.body(), builder); // mu = np.sqrt(x[k]*x[k] + sigma) - Value alpha_sq = builder->create(loc, alpha, alpha, - /*broadcast_dimensions=*/nullptr); + Value alpha_sq = builder->create(loc, alpha, alpha); Value mu = builder->create( - loc, builder->create(loc, alpha_sq, sigma.getResult(0), - /*broadcast_dimensions=*/nullptr)); + loc, builder->create(loc, alpha_sq, sigma.getResult(0))); - Value sigma_is_zero = builder->create( + Value sigma_is_zero = builder->create( loc, sigma.getResult(0), zero, GetI64ElementsAttr({}, builder), StringAttr::get("EQ", builder->getContext())); - Value alpha_is_negative = builder->create( + Value alpha_is_negative = builder->create( loc, alpha, zero, GetI64ElementsAttr({}, builder), StringAttr::get("LT", builder->getContext())); auto batch_size_one = builder->create( loc, alpha.getType(), one, GetI64ElementsAttr(batch_dims, builder)); - Value signed_mu = builder->create( + Value signed_mu = builder->create( loc, builder->create(loc, mu.getType(), alpha_is_negative, batch_size_one, @@ -4461,21 +4713,16 @@ class ConvertQrOp : public OpRewritePattern { *beta = builder->create(loc, alpha.getType(), sigma_is_zero, alpha, signed_mu); *tau = builder->create( - loc, - builder->create(loc, *beta, alpha, - /*broadcast_dimensions=*/nullptr), - *beta, - /*broadcast_dimensions=*/nullptr); + loc, builder->create(loc, *beta, alpha), *beta); Value zero_tau = builder->create( loc, alpha.getType(), zero, GetI64ElementsAttr(batch_dims, builder)); *tau = builder->create(loc, alpha.getType(), sigma_is_zero, zero_tau, *tau); - Value divisor = builder->create(loc, alpha, *beta, - /*broadcast_dimensions=*/nullptr); + Value divisor = builder->create(loc, alpha, *beta); divisor = builder->create(loc, divisor.getType(), sigma_is_zero, batch_size_one, divisor); - Value eqk = builder->create( + Value eqk = builder->create( loc, iota, k, GetI64ElementsAttr({}, builder), StringAttr::get("EQ", builder->getContext())); eqk = builder->create(loc, eqk, x_type.getElementType()); @@ -4488,10 +4735,12 @@ class ConvertQrOp : public OpRewritePattern { // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor. - *v = builder->create( + // Note that the add performs a degenerate broadcast. 
+ *v = builder->create( loc, e_k, - builder->create(loc, x_after_k, divisor, - GetI64ElementsAttr(batch_dim_ids, builder)), + StaticBinaryBroadcast(loc, x_after_k, divisor, + GetI64ElementsAttr(batch_dim_ids, builder), + *builder), /*broadcast_dimensions=*/nullptr); } @@ -4565,10 +4814,10 @@ class ConvertQrOp : public OpRewritePattern { precision, builder); vva = BatchDot(loc, v_broadcast, true, vva, false, num_batch_dims, precision, builder); - auto tau_x_vva = builder->create( - loc, tau, vva, GetI64ElementsAttr(batch_dim_indices, builder)); - a = builder->create(loc, a, tau_x_vva, - /*broadcast_dimensions=*/nullptr); + auto tau_x_vva = StaticBinaryBroadcast( + loc, tau, vva, GetI64ElementsAttr(batch_dim_indices, builder), + *builder); + a = builder->create(loc, a, tau_x_vva); // It is more precise to populate column 'k' explicitly, rather than // computing it implicitly by applying the Householder transformation. @@ -4577,12 +4826,12 @@ class ConvertQrOp : public OpRewritePattern { auto iota = builder->create( loc, RankedTensorType::get({m, 1}, builder->getIntegerType(32)), builder->getI64IntegerAttr(0)); - Value predecessor_mask = builder->create( + Value predecessor_mask = builder->create( loc, iota, j, GetI64ElementsAttr({}, builder), StringAttr::get("LT", builder->getContext())); predecessor_mask = builder->create(loc, predecessor_mask, a_type.getElementType()); - Value mask = builder->create( + Value mask = builder->create( loc, iota, j, GetI64ElementsAttr({}, builder), StringAttr::get("EQ", builder->getContext())); mask = builder->create(loc, mask, a_type.getElementType()); @@ -4594,14 +4843,14 @@ class ConvertQrOp : public OpRewritePattern { mask, GetI64ElementsAttr(llvm::SmallVector(num_batch_dims, 1), builder)); - Value predecessor_masked_x = builder->create( + Value predecessor_masked_x = StaticBinaryBroadcast( loc, x, predecessor_mask, - GetI64ElementsAttr({num_dims - 2, num_dims - 1}, builder)); - Value masked_beta = builder->create( - loc, beta, mask, GetI64ElementsAttr(batch_dim_indices, builder)); + GetI64ElementsAttr({num_dims - 2, num_dims - 1}, builder), *builder); + Value masked_beta = StaticBinaryBroadcast( + loc, beta, mask, GetI64ElementsAttr(batch_dim_indices, builder), + *builder); Value new_x = - builder->create(loc, predecessor_masked_x, masked_beta, - /*broadcast_dimensions=*/nullptr); + builder->create(loc, predecessor_masked_x, masked_beta); // Update a[:,j] llvm::SmallVector dim_ids(num_dims); std::iota(dim_ids.begin(), dim_ids.end(), 0); @@ -4612,7 +4861,7 @@ class ConvertQrOp : public OpRewritePattern { loc, RankedTensorType::get(a_type.getShape(), builder->getIntegerType(32)), builder->getI64IntegerAttr(minor_dim + 1)); - Value xa_mask = builder->create( + Value xa_mask = builder->create( loc, iota_mn, j, GetI64ElementsAttr({}, builder), StringAttr::get("EQ", builder->getContext())); a = builder->create(loc, a_type, xa_mask, new_x, a); @@ -4628,11 +4877,11 @@ class ConvertQrOp : public OpRewritePattern { builder)); auto vs_update = builder->create( loc, vs.getType(), xa_mask, - builder->create( - loc, vs_zeros, v, GetI64ElementsAttr(vs_broadcast_dims, builder)), + StaticBinaryBroadcast( + loc, vs_zeros, v, GetI64ElementsAttr(vs_broadcast_dims, builder), + *builder), vs_zeros); - vs = builder->create(loc, vs, vs_update, - /*broadcast_dimensions=*/nullptr); + vs = builder->create(loc, vs, vs_update); // taus[j] = tau llvm::SmallVector tau_broadcast_dims(batch_dims.size()); @@ -4649,17 +4898,16 @@ class ConvertQrOp : public OpRewritePattern { loc, 
taus.getType(), taus_zeros, GetI64ElementsAttr(taus.getType().cast().getShape(), builder)); - Value taus_mask = builder->create( + Value taus_mask = builder->create( loc, iota_n, j, GetI64ElementsAttr({}, builder), StringAttr::get("EQ", builder->getContext())); auto taus_update = builder->create( loc, taus.getType(), taus_mask, - builder->create( + StaticBinaryBroadcast( loc, taus_zeros, tau, - GetI64ElementsAttr(tau_broadcast_dims, builder)), + GetI64ElementsAttr(tau_broadcast_dims, builder), *builder), taus_zeros); - taus = builder->create(loc, taus, taus_update, - /*broadcast_dimensions=*/nullptr); + taus = builder->create(loc, taus, taus_update); new_values->assign({a, vs, taus}); }; @@ -4716,8 +4964,7 @@ class ConvertQrOp : public OpRewritePattern { j = builder->create( loc, j, GetScalarConstOfType(getElementTypeOrSelf(j.getType()), loc, 1, - builder), - /*broadcast_dimensions=*/nullptr); + builder)); // vs has shape [..., m, 1] auto v = DynamicSliceInMinorDims(loc, vs, {j}, {1}, builder); // beta has shape [..., 1] @@ -4736,7 +4983,7 @@ class ConvertQrOp : public OpRewritePattern { loc, vs.getType(), zero, GetI64ElementsAttr(vs.getType().cast().getShape(), builder)); - auto compare = builder->create( + auto compare = builder->create( loc, iota_mn, j, GetI64ElementsAttr({}, builder), StringAttr::get("GE", builder->getContext())); auto y = builder->create(loc, vs.getType(), compare, zero, vs); @@ -4751,13 +4998,12 @@ class ConvertQrOp : public OpRewritePattern { // z = -beta * (v + wyv) auto neg_beta = builder->create(loc, beta); - auto v_wyv = builder->create(loc, v, wyv, - /*broadcast_dimensions=*/nullptr); + auto v_wyv = builder->create(loc, v, wyv); auto beta_broadcast_dims = llvm::to_vector<4>(batch_dim_indices); beta_broadcast_dims.push_back(n_index); - auto z = builder->create( + auto z = StaticBinaryBroadcast( loc, neg_beta, v_wyv, - GetI64ElementsAttr(beta_broadcast_dims, builder)); + GetI64ElementsAttr(beta_broadcast_dims, builder), *rewriter); w = DynamicUpdateSliceInMinorDims(loc, w, z, {j}, builder); new_values->assign({w, vs, taus}); @@ -4775,8 +5021,9 @@ class ConvertQrOp : public OpRewritePattern { auto neg_beta = rewriter->create(loc, beta); auto beta_broadcast_dims = llvm::to_vector<4>(batch_dim_indices); beta_broadcast_dims.push_back(n_index); - auto bv = rewriter->create( - loc, neg_beta, v, GetI64ElementsAttr(beta_broadcast_dims, rewriter)); + auto bv = StaticBinaryBroadcast( + loc, neg_beta, v, GetI64ElementsAttr(beta_broadcast_dims, rewriter), + *rewriter); w = UpdateSliceInMinorDims(loc, w, bv, {0}, rewriter); SmallVector while_output; @@ -4785,9 +5032,55 @@ class ConvertQrOp : public OpRewritePattern { } }; +// Emits debug information which includes the number of ops of each type which +// failed to legalize. +void EmitLegalizationErrors(Operation *op, + const DenseSet &nonlegalized_ops) { + // Track the legalization failures by mapping op name to information about + // that failure: the number of unlegalized occurances of the op, and one + // example operation that failed. + std::map> op_name_to_error_info; + DenseSet error_ops; + for (Operation *nonlegalized_op : nonlegalized_ops) { + // Increment count of this legalization failure. + StringRef op_name = nonlegalized_op->getName().getStringRef(); + // If this emplace is successful, it's the first time we've encountered + // this op type. Initialize count to 0 so that after increment, it is 1. 
+ auto insertion_result = op_name_to_error_info.emplace( + op_name, std::make_pair(0, nonlegalized_op)); + ++insertion_result.first->second.first; + } + std::vector error_messages; + error_messages.reserve(op_name_to_error_info.size()); + for (const auto &op_info : op_name_to_error_info) { + error_messages.push_back( + llvm::formatv("{0} (count: {1})", op_info.first, op_info.second.first)); + } + Location loc = op->getLoc(); + emitError(loc) << "The following operations cannot be legalized: " + << llvm::join(error_messages, "; ") + << ". These legalization failure(s) may be due to missing TF " + "to HLO lowerings and/or unsupported attributes, etc."; + // Emit more information about the missing ops. This error message + // contains useful details beyond the op name (input and output shapes, + // attributes, etc.). + if (!VLOG_IS_ON(1) && nonlegalized_ops.size() != 1) { + emitError(loc) + << "Emitting more detail about one op that failed to legalize..."; + } else if (VLOG_IS_ON(1)) { + emitError(loc) << "Emitting more detail about one of each type of op " + "that failed to legalize..."; + } + for (const auto &op_info : op_name_to_error_info) { + op_info.second.second->emitOpError() << "is not legalizable"; + if (!VLOG_IS_ON(1)) break; + } +} + // Performs the lowering to XLA dialect. void LegalizeTF::runOnFunction() { - if (failed(legalizeTF(getFunction(), allow_partial_conversion_))) + if (failed( + legalizeTF(getFunction(), allow_partial_conversion_, legalize_chlo_))) signalPassFailure(); } @@ -4798,7 +5091,8 @@ static PassRegistration pass( #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" -LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { +LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, + bool legalize_chlo) { MLIRContext *context = op->getContext(); // Add lowering patterns to the list. 
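[Editor's note: illustrative sketch, not part of the patch.] The EmitLegalizationErrors helper above counts legalization failures per op name with an emplace-then-increment idiom: the emplace succeeds only the first time a name is seen, inserting a count of 0 so the unconditional increment makes it 1. A minimal standalone C++ sketch of that idiom, with purely hypothetical op names and without any MLIR/LLVM types, looks like this:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Stand-ins for the non-legalized operations collected by the conversion.
  std::vector<std::string> nonlegalized = {"tf.Foo", "tf.Bar", "tf.Foo"};

  // op name -> (failure count, index of one example occurrence).
  std::map<std::string, std::pair<int, size_t>> op_name_to_error_info;
  for (size_t i = 0; i < nonlegalized.size(); ++i) {
    // emplace only inserts for a previously unseen name; count starts at 0
    // so that the increment below turns it into 1 on first sight.
    auto insertion_result =
        op_name_to_error_info.emplace(nonlegalized[i], std::make_pair(0, i));
    ++insertion_result.first->second.first;
  }

  for (const auto &entry : op_name_to_error_info)
    std::cout << entry.first << " (count: " << entry.second.first << ")\n";
  // Prints: tf.Bar (count: 1), then tf.Foo (count: 2).
}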
@@ -4811,13 +5105,14 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { TF::PopulateLoweringTFPatterns(context, &patterns); patterns.insert< ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, - ConvertBroadcastToOp, ConvertBF16FloorDivOp, ConvertConv2DOp, - ConvertConv3DOp, ConvertDepthConv2DOp, ConvertConv2DBackpropFilterOp, - ConvertConv3DBackpropFilterOp, ConvertConv2DBackpropInputOp, - ConvertConv3DBackpropInputOp, ConvertCumsumOp, ConvertDiagPartOp, - ConvertEinsumOp, ConvertFusedBatchNormGradOp, - ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, - ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, ConvertLinSpaceOp, + ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, + ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, + ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, + ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, + ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, + ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, + ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV3Op, + ConvertInfeedDequeueTupleOp, ConvertInplaceUpdateOp, ConvertLinSpaceOp, ConvertMaxOp, ConvertMinOp, ConvertAvgPoolOp, ConvertMaxPool2DOp, ConvertMaxPool3DOp, ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, ConvertMeanOp, ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, @@ -4831,7 +5126,18 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertRandomShuffleOp, ConvertVariableShapeOp, ConvertXlaShardingOp, ConvertXlaDynamicUpdateSliceOp>(op->getContext()); + // Populate with CHLO->HLO lowerings to account for TF ops legalized to + // CHLO first. + if (legalize_chlo) { + xla_chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); + } + ConversionTarget target(*context); + if (legalize_chlo) { + target.addIllegalDialect(); + } else { + target.addLegalDialect(); + } target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); @@ -4841,15 +5147,24 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { if (!allow_partial_conversion) { // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp. target.addLegalOp(); - return applyFullConversion(op, target, patterns); + DenseSet nonlegalized_ops; + LogicalResult result = applyPartialConversion( + op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops); + // In order to enforce that the conversion result is fully converted, + // fail if there are any nonlegalized ops in the set. 
+ if (failed(result) || !nonlegalized_ops.empty()) { + EmitLegalizationErrors(op, nonlegalized_ops); + return failure(); + } + return result; } return applyPartialConversion(op, target, patterns); } std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion) { - return std::make_unique(allow_partial_conversion); + bool allow_partial_conversion, bool legalize_chlo) { + return std::make_unique(allow_partial_conversion, legalize_chlo); } } // end namespace xla_hlo diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 86927fe0e07..d5e5b6f5a71 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -66,7 +66,7 @@ createLegalizeTFControlFlowPass() { namespace { void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { - // De-tuple the results of the xla hlo conditional result. + // De-tuple the results of the xla hlo if result. for (auto result_it : llvm::enumerate(replace)) { auto get_tuple_value = builder->create( result_it.value().getLoc(), tuple, result_it.index()); @@ -74,14 +74,13 @@ void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { } } -// Imports the source region into the destination region. The XLA conditional +// Imports the source region into the destination region. The XLA if // operation only supports one argument per branch. Therefore any branch that // requires additional arguments requires their values be tupled together. Then, // to support multiple returns (as XLA only supports a single return value) the -// results of the conditional are tupled together. +// results of the if operation are tupled together. void ImportXlaRegion(mlir::FuncOp func, Region* dest_region, Location loc, bool tuple_return = true) { - BlockAndValueMapping mapper; OpBuilder builder(dest_region); auto entry_block = builder.createBlock(dest_region); @@ -111,27 +110,52 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // XLA prefers tuple arguments for control flow due to XLA not supporting // multiple return values. SmallVector inputs(op.input()); - builder.setInsertionPoint(op); auto tuple_input = builder.create(loc, inputs); - // Create the new conditional op with tuple inputs. - SmallVector operands(op.getOperands()); + // Create the new if op with tuple inputs. auto result_type = builder.getTupleType(op.getResultTypes()); - auto conditional = builder.create( - loc, result_type, op.cond(), tuple_input, tuple_input); + auto if_op = builder.create(loc, result_type, op.cond(), + tuple_input, tuple_input); // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo // return op. - BlockAndValueMapping mapper; auto then_branch = module.lookupSymbol(op.then_branch()); auto else_branch = module.lookupSymbol(op.else_branch()); - ImportXlaRegion(then_branch, &conditional.true_branch(), loc); - ImportXlaRegion(else_branch, &conditional.false_branch(), loc); + ImportXlaRegion(then_branch, &if_op.true_branch(), loc); + ImportXlaRegion(else_branch, &if_op.false_branch(), loc); - // De-tuple the results of the xla hlo conditional result. - builder.setInsertionPointAfter(op); - Detuple(conditional.getResult(), op.getResults(), &builder); + // De-tuple the results of the xla hlo if result. 
+ Detuple(if_op.getResult(), op.getResults(), &builder); + op.erase(); +} + +void LowerCase(TF::CaseOp op, ModuleOp module) { + Location loc = op.getLoc(); + OpBuilder builder(op); + + // XLA requires one argument per branch so we create a tuple of inputs to pass + // to each branch. + SmallVector inputs(op.input()); + auto tuple_input = builder.create(loc, inputs); + + // Create replica of input tuple for each branch + SmallVector n_tuple_inputs(op.branches().size(), tuple_input); + + // Create the new case op with tuple inputs. + auto case_op = builder.create( + loc, op.getResultTypes(), op.branch_index(), n_tuple_inputs, + op.branches().size()); + + // Import the regions for all branches. + for (unsigned i = 0; i < op.branches().size(); ++i) { + mlir::FuncOp branch_func = module.lookupSymbol( + op.branches()[i].cast()); + ImportXlaRegion(branch_func, &case_op.branches()[i], loc, + /*tuple_return=*/false); + } + + op.replaceAllUsesWith(case_op.getResults()); op.erase(); } @@ -146,7 +170,6 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { Value tuple_input = builder.create(loc, inputs); // Create the new while op with tuple inputs. - SmallVector operands(op.getOperands()); auto while_op = builder.create( loc, builder.getTupleType(op.getResultTypes()), tuple_input); @@ -159,7 +182,6 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { ImportXlaRegion(cond_branch, &while_op.cond(), loc, /*tuple_return=*/false); // De-tuple the results of the xla hlo while. - builder.setInsertionPointAfter(op); Detuple(while_op.getResult(), op.getResults(), &builder); op.erase(); } @@ -168,8 +190,20 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { void LegalizeTFControlFlow::runOnOperation() { auto module = getOperation(); - module.walk([&](TF::WhileOp op) -> void { LowerWhile(op, module); }); - module.walk([&](TF::IfOp op) -> void { LowerIf(op, module); }); + module.walk([&](Operation* op) { + if (auto while_op = dyn_cast(op)) { + LowerWhile(while_op, module); + return; + } + if (auto if_op = dyn_cast(op)) { + LowerIf(if_op, module); + return; + } + if (auto case_op = dyn_cast(op)) { + LowerCase(case_op, module); + return; + } + }); } } // namespace xla_hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index b2a7c1e7f62..ef5a8356a32 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -18,6 +18,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def SignedIntTensor : TensorOf<[I1, I8, I16, I32, I64]>; @@ -72,18 +73,6 @@ def : Pattern< // HLO and XLA doesn't support Assertions. def LowerAssert : Pattern<(TF_AssertOp $condition, $data, $summarize), []>; -//===----------------------------------------------------------------------===// -// Bias op patterns. -//===----------------------------------------------------------------------===// -def BiasAddFeatureDimension : NativeCodeCall< - "getBiasFeatureDimension($_builder, $0, $1)">; - -// $input needs to be a ranked tensor to identify index of the feature -// dimension depending on the data_format 'NHWC' or 'NCHW'. 
-def : Pat<(TF_BiasAddOp AnyRankedTensor:$input, $bias, $data_format), - (HLO_AddOp $input, $bias, - (BiasAddFeatureDimension $data_format, $input))>; - //===----------------------------------------------------------------------===// // Binary op patterns. //===----------------------------------------------------------------------===// @@ -96,21 +85,22 @@ class DirectBinaryPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; -foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], - [TF_AddV2Op, HLO_AddOp], - [TF_DivOp, HLO_DivOp], - [TF_LeftShiftOp, HLO_ShiftLeftOp], - [TF_MaximumOp, HLO_MaxOp], - [TF_MinimumOp, HLO_MinOp], - [TF_MulOp, HLO_MulOp], - [TF_PowOp, HLO_PowOp], - [TF_RealDivOp, HLO_DivOp], - [TF_SubOp, HLO_SubOp]] in +foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], + [TF_AddV2Op, HLOClient_BroadcastAddOp], + [TF_DivOp, HLOClient_BroadcastDivOp], + [TF_LeftShiftOp, HLOClient_BroadcastShiftLeftOp], + [TF_MaximumOp, HLOClient_BroadcastMaxOp], + [TF_MinimumOp, HLOClient_BroadcastMinOp], + [TF_MulOp, HLOClient_BroadcastMulOp], + [TF_PowOp, HLOClient_BroadcastPowOp], + [TF_RealDivOp, HLOClient_BroadcastDivOp], + [TF_SubOp, HLOClient_BroadcastSubOp]] in def : DirectBinaryPat; def LowerRightShiftSigned : Pat<(TF_RightShiftOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_ShiftRightArithmeticOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastShiftRightArithmeticOp $l, $r, + (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $r)]>; // TODO(hinsu): Lower unsigned types to HLO_ShiftRightLogical once the HLO op @@ -122,10 +112,11 @@ def : Pat<(TF_ComplexOp $r, $i), (HLO_ComplexOp $r, $i)>; // // return floor(div(x, y)) def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_FloorOp (HLO_DivOp $l, $r, (BinBroadcastDimensions $l, $r))), + (HLO_FloorOp + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r))), [(IEEEFloatTensor $l)]>; -// Performs a substitution of FloorDir for integer tensors, which required +// Performs a substitution of FloorDiv for integer tensors, which required // additional correction for a negative numerator / denominator. Equivalent // pseudocode is shown below: // @@ -144,19 +135,19 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), // NOTE: This should be optimized for unsigned integers. // Requires static shaped inputs to create constant splats and computation of // broadcast attributes. 
-def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), +def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLO_SelectOp - (HLO_CompareOp - (HLO_CompareOp $l, (HLO_ConstOp (ConstantSplat<"0"> $l)), + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastCompareOp $l, (HLO_ConstOp (GetScalarOfType<0> $l)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), - (HLO_CompareOp $r, (HLO_ConstOp (ConstantSplat<"0"> $r)), + (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $l, $r), HLO_COMPARISON_DIRECTION_EQ), - (HLO_DivOp $l, $r, (BinBroadcastDimensions $l, $r)), - (HLO_DivOp - (HLO_NegOp:$neg (HLO_AddOp (HLO_AbsOp $l), - (HLO_SubOp (HLO_AbsOp $r), - (HLO_ConstOp (ConstantSplat<"1"> $r)), + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastDivOp + (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), + (HLOClient_BroadcastSubOp (HLO_AbsOp $r), + (HLO_ConstOp (GetScalarOfType<1> $r)), (NullDenseIntElementsAttr)), (BinBroadcastDimensions $l, $r))), (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), @@ -169,22 +160,22 @@ def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), // return trunc_mod != 0 && (y < 0 != trunc_mod < 0) ? trunc_mod + y // Requires static shaped inputs to create constant splats and computation of // broadcast attributes. -def : Pat<(TF_FloorModOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), +def : Pat<(TF_FloorModOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLO_SelectOp - (HLO_AndOp - (HLO_CompareOp - (HLO_RemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), - (HLO_ConstOp:$l_zeros (ConstantSplat<"0"> $l)), + (HLOClient_BroadcastAndOp + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastRemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), + (HLO_ConstOp:$l_zeros (GetScalarOfType<0> $l)), (BinBroadcastDimensions $l, $rem), HLO_COMPARISON_DIRECTION_NE), - (HLO_CompareOp - (HLO_CompareOp:$r_cmp $r, - (HLO_ConstOp:$r_zeros (ConstantSplat<"0"> $r)), + (HLOClient_BroadcastCompareOp + (HLOClient_BroadcastCompareOp:$r_cmp $r, + (HLO_ConstOp:$r_zeros (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), - (HLO_CompareOp:$rem_cmp $rem, $r_zeros, + (HLOClient_BroadcastCompareOp:$rem_cmp $rem, $r_zeros, (BinBroadcastDimensions $rem, $r_zeros), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $r_cmp, $rem_cmp), HLO_COMPARISON_DIRECTION_NE), (NullDenseIntElementsAttr)), - (HLO_AddOp $r, + (HLOClient_BroadcastAddOp $r, $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; //===----------------------------------------------------------------------===// @@ -196,10 +187,10 @@ class DirectLogicalBinaryPat (ToOp $l, $r, (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $l)]>; -foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], - [TF_LogicalOrOp, HLO_OrOp], - [TF_BitwiseOrOp, HLO_OrOp], - [TF_BitwiseAndOp, HLO_AndOp]] in +foreach fromToBinPair = [[TF_LogicalAndOp, HLOClient_BroadcastAndOp], + [TF_LogicalOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseAndOp, HLOClient_BroadcastAndOp]] in def : DirectLogicalBinaryPat; //===----------------------------------------------------------------------===// @@ -208,7 +199,8 @@ foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], class DirectComparePat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions 
$l, $r), direction)>; + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction)>; def : DirectComparePat; def : DirectComparePat; @@ -218,7 +210,8 @@ def : DirectComparePat; class EqualityPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r, TrueBoolAttr:$incompatible_shape_error), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction), + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction), [(AreBroadcastCompatible $l, $r)]>; def : EqualityPat; @@ -273,6 +266,13 @@ def : Pat<(TF_CrossReplicaSumOp $input, (TF_ConstOp $group_assignment)), (HLO_CrossReplicaSumOp $input, (CastElementsToI64Elements $group_assignment))>; +//===----------------------------------------------------------------------===// +// All2All op patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (TF_ConstOp $group_assignment), I64Attr:$concat_dimension, $split_dimension, $split_count), + (HLO_AllToAllOp $input, $split_dimension, $concat_dimension, $split_count, (CastElementsToI64Elements $group_assignment))>; + //===----------------------------------------------------------------------===// // FFT op patterns. //===----------------------------------------------------------------------===// @@ -393,39 +393,36 @@ def : Pattern<(TF_MatrixBandPartOp:$op AnyRankedTensor:$input, $num_lower, $num_ (HLO_SelectOp:$num_lower_or_m (HLO_CompareOp $num_lower, (HLO_ConstOp:$zero (ConstantSplat<"0"> $num_lower)), - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + HLO_COMPARISON_DIRECTION_LT ), $m_dim, $num_lower ), (HLO_SelectOp:$num_upper_or_n (HLO_CompareOp - $num_upper, $zero, - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + $num_upper, $zero, HLO_COMPARISON_DIRECTION_LT ), $n_dim, $num_upper ), (HLO_SelectOp (HLO_AndOp - (HLO_CompareOp + (HLOClient_BroadcastCompareOp (HLO_NegOp (createConvertOp $op, $num_lower_or_m, $input) ), (HLO_SubOp:$offset - (createIotaOp<"1"> $op, $input), (createIotaOp<"0"> $op, $input), - (NullDenseIntElementsAttr) + (createIotaOp<"1"> $op, $input), (createIotaOp<"0"> $op, $input) ), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE ), - (HLO_CompareOp + (HLOClient_BroadcastCompareOp $offset, (createConvertOp $op, $num_upper_or_n, $input ), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE - ), - (BinBroadcastDimensions $offset, $input) + ) ), $input, (HLO_ConstOp (ConstantSplat<"0"> $input)) @@ -449,8 +446,9 @@ def : Pat<(TF_ConstOp:$res ElementsAttr:$value), // TODO(hinsu): Lower unsigned and quantized types after supporting // them in GetScalarOfType. def : Pat<(TF_ReluOp AnyRankedTensor:$input), - (HLO_MaxOp (HLO_ConstOp:$zero (GetScalarOfType<0> $input)), $input, - (BinBroadcastDimensions $zero, $input)), + (HLOClient_BroadcastMaxOp + (HLO_ConstOp:$zero (GetScalarOfType<0> $input)), $input, + (BinBroadcastDimensions $zero, $input)), [(TF_SintOrFpTensor $input)]>; // TODO(hinsu): Lower unsigned and quantized types after supporting @@ -472,7 +470,7 @@ def : Pat<(TF_Relu6Op AnyRankedTensor:$input), // to create splat tensor of dynamic shape in HLO. 
def : Pat<(TF_ReluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), (HLO_SelectOp - (HLO_CompareOp $features, + (HLOClient_BroadcastCompareOp $features, (HLO_ConstOp (GetScalarOfType<0> $features)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_GT), $gradients, (HLO_ConstOp (ConstantSplat<"0"> $gradients)))>; @@ -514,16 +512,14 @@ foreach callOp = [TF_PartitionedCallOp, TF_StatefulPartitionedCallOp] in { } //===----------------------------------------------------------------------===// -// Ternary op patterns. +// Reverse op patterns. //===----------------------------------------------------------------------===// -def BothTypesMatch : Constraint, - "types must be equal">; +// Handles axis conversion for TF reverse. +def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, $1, &$_builder)">; -def : Pat<(TF_SelectOp $cond, $t, $e), (HLO_SelectOp $cond, $t, $e), - // TODO(jpienaar): This restriction is to avoid creating a currently - // unsupported HLO select. - [(BothTypesMatch $t, $e)]>; +def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (TF_ConstOp $axis)), + (HLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; //===----------------------------------------------------------------------===// // Unary op patterns. @@ -543,7 +539,6 @@ foreach Mapping = [ [TF_LogicalNotOp, HLO_NotOp], [TF_NegOp, HLO_NegOp], [TF_RealOp, HLO_RealOp], - [TF_RoundOp, HLO_RoundOp], [TF_RsqrtOp, HLO_RsqrtOp], [TF_SinOp, HLO_SinOp], [TF_SqrtOp, HLO_SqrtOp], @@ -576,7 +571,6 @@ def : Pat<(TF_SignOp $x), (HLO_CompareOp $x, $x, - (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_NE ), (HLO_ConstOp (ConstantSplat<"0"> $x)), @@ -617,10 +611,10 @@ def : Pat<(srcDstOpPair[0]:$old $shape, $seed, $seed2), //===----------------------------------------------------------------------===// // Sigmoid grad op. //===----------------------------------------------------------------------===// + +// TODO(hinsu): Handle unranked inputs by broadcasting constant one to the +// shape of $l instead of having it as a constant. 
def : Pat<(TF_SigmoidGradOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLO_MulOp - (HLO_MulOp $r, $l, (NullDenseIntElementsAttr)), - (HLO_SubOp (HLO_ConstOp (ConstantSplat<"1"> $l)), $l, - (NullDenseIntElementsAttr)), - (NullDenseIntElementsAttr)), - [(IEEEFloatTensor $l)]>; + (HLO_MulOp $r, $l), + (HLO_SubOp (HLO_ConstOp (ConstantSplat<"1"> $l)), $l))>; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 551462572f1..b15974979c9 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -83,31 +83,52 @@ static bool IsOpWhitelisted(Operation* op) { // clang-format off static llvm::SmallDenseSet ops = { TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), - TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -116,21 +137,43 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get() + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get() }; // clang-format on diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td index c0f6c2c3541..21e39db018b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td @@ -36,47 +36,36 @@ def IsSameSizePred : CPred< def IsSameSizeConstraint : Constraint; -def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r), (AndOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r), (AddFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r), (SubFOp $l, $r), [(IsSameSizeConstraint $l, 
$r)]>; -def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r), (MulFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r), (DivFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_RemOp HLO_FpTensor:$l, HLO_FpTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_RemOp HLO_FpTensor:$l, HLO_FpTensor:$r), (RemFOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r), (AddIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SubIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r), (MulIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SignedDivIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; -def : Pat<(HLO_RemOp HLO_IntTensor:$l, HLO_IntTensor:$r, - IsNullAttr:$broadcast_dimensions), +def : Pat<(HLO_RemOp HLO_IntTensor:$l, HLO_IntTensor:$r), (SignedRemIOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index 43c0911a4a6..ddbb672c70a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -57,8 +57,9 @@ class LhloFuseLinalg : public PassWrapper { for (auto func_arg : func.getArguments()) { func_args.insert(func_arg); } + MLIRContext* ctx = func.getContext(); OpBuilder b(func); - OperationFolder folder(func.getContext()); + OperationFolder folder(ctx); func.walk([&](linalg::GenericOp generic_op) { SmallVector tile_sizes(tile_sizes_.begin(), tile_sizes_.end()); @@ -68,12 +69,14 @@ class LhloFuseLinalg : public PassWrapper { auto op = cast(generic_op.getOperation()); for (const Value result : op.getOutputBuffers()) { if (!func_args.count(result)) continue; - if (tileGenericOp(op, tile_sizes, &b, &folder)) { + if (tileGenericOp(op, tile_sizes, &b)) { generic_op.erase(); return; } } }); + auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx); + applyPatternsAndFoldGreedily(func, patterns); // Fuse producers of tiled linalg ops. llvm::SmallDenseSet erase_set; @@ -92,19 +95,22 @@ class LhloFuseLinalg : public PassWrapper { *originalOpInLinalgOpsVector = info->fusedProducer.getOperation(); } } + + auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx); + applyPatternsAndFoldGreedily(func, patterns); } for (auto* e : erase_set) e->erase(); } private: - bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b, - OperationFolder* folder) { - auto tiled_generic_op = - use_parallel_loops_ - ? 
linalg::tileLinalgOpToParallelLoops(*b, op, tile_sizes, - /*permutation=*/{}, folder) - : linalg::tileLinalgOp(*b, op, tile_sizes, - /*permutation=*/{}, folder); + bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b) { + auto loopType = use_parallel_loops_ + ? linalg::LinalgTilingLoopType::ParallelLoops + : linalg::LinalgTilingLoopType::Loops; + auto tiled_generic_op = linalg::tileLinalgOp(*b, op, + linalg::LinalgTilingOptions() + .setTileSizes(tile_sizes) + .setLoopType(loopType)); return tiled_generic_op.hasValue(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index 2921a49ba70..f7f5537f882 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -54,14 +54,14 @@ struct BinaryOpConverter : public OpRewritePattern { induction_vars.push_back(forOp.getInductionVar()); rewriter.setInsertionPointToStart(forOp.getBody()); } - auto l = rewriter.create(loc, lhs, induction_vars); - auto r = rewriter.create(loc, rhs, induction_vars); + auto l = rewriter.create(loc, lhs, induction_vars); + auto r = rewriter.create(loc, rhs, induction_vars); Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( op, element_type, {l, r}, &rewriter); if (opResult == nullptr) { return failure(); } - rewriter.create(loc, opResult, op.out(), induction_vars); + rewriter.create(loc, opResult, op.out(), induction_vars); rewriter.eraseOp(op); return success(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc index e6f3ac02d4f..f0eb3cc1a0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc @@ -21,7 +21,7 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -112,7 +112,7 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { auto step = rewriter.create( loc, rewriter.getIndexType(), rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); - auto loop = rewriter.create(loc, zero, upper, step); + auto loop = rewriter.create(loc, zero, upper, step); rewriter.setInsertionPointToStart(loop.getBody()); // Compute memrefs for the value to reduce. This makes it easier to just @@ -173,8 +173,7 @@ struct LhloLegalizeToGpu : public PassWrapper { OwningRewritePatternList patterns; ConversionTarget target(getContext()); target.addLegalDialect(); + gpu::GPUDialect, scf::SCFDialect, XlaLhloDialect>(); target.addIllegalOp(); auto func = getFunction(); patterns.insert(func.getContext()); diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc new file mode 100644 index 00000000000..385e0859906 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -0,0 +1,136 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +struct StaticMemRefCastOpConverter + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto cast_op = cast(op); + + StaticMemRefCastOpOperandAdaptor operands_adaptor(operands); + MemRefDescriptor sourceMemRef(operands_adaptor.operand()); + + MemRefType targetMemRefType = + cast_op.getResult().getType().cast(); + auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType) + .dyn_cast_or_null(); + if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy()) + return failure(); + // Create descriptor. + auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); + Type llvmTargetElementTy = desc.getElementType(); + // Set allocated ptr. + Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); + allocated = + rewriter.create(loc, llvmTargetElementTy, allocated); + desc.setAllocatedPtr(rewriter, loc, allocated); + // Set aligned ptr. + Value ptr = sourceMemRef.alignedPtr(rewriter, loc); + ptr = rewriter.create(loc, llvmTargetElementTy, ptr); + desc.setAlignedPtr(rewriter, loc, ptr); + + // Fill size and stride descriptors in memref. + auto target_sizes = targetMemRefType.getShape(); + int64_t target_offset; + llvm::SmallVector target_strides; + if (failed((getStridesAndOffset(targetMemRefType, target_strides, + target_offset)))) + return failure(); + + // Copy offset of `targetMemRef`. 
+ desc.setConstantOffset(rewriter, loc, target_offset); + for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) { + desc.setConstantSize(rewriter, loc, i, target_sizes[i]); + desc.setConstantStride(rewriter, loc, i, target_strides[i]); + } + rewriter.replaceOp(op, {desc}); + return success(); + } +}; + +struct DynamicMemRefCastOpConverter + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto cast_op = cast(op); + + DynamicMemRefCastOpOperandAdaptor operands_adaptor(operands); + MemRefDescriptor sourceMemRef(operands_adaptor.operand()); + + MemRefType targetMemRefType = + cast_op.getResult().getType().cast(); + auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType) + .dyn_cast_or_null(); + if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy()) + return failure(); + // Create descriptor. + auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); + Type llvmTargetElementTy = desc.getElementType(); + // Set allocated ptr. + Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); + allocated = + rewriter.create(loc, llvmTargetElementTy, allocated); + desc.setAllocatedPtr(rewriter, loc, allocated); + // Set aligned ptr. + Value ptr = sourceMemRef.alignedPtr(rewriter, loc); + ptr = rewriter.create(loc, llvmTargetElementTy, ptr); + desc.setAlignedPtr(rewriter, loc, ptr); + // Copy offset of `sourceMemRef`. + desc.setOffset(rewriter, loc, sourceMemRef.offset(rewriter, loc)); + + // Fill size and stride descriptors in memref. + if (!cast_op.sizes().empty()) { + auto sizes = operands_adaptor.sizes(); + auto strides = operands_adaptor.strides(); + for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) { + desc.setSize(rewriter, loc, i, sizes[i]); + desc.setStride(rewriter, loc, i, strides[i]); + } + } + rewriter.replaceOp(op, {desc}); + return success(); + } +}; + +} // namespace + +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, + OwningRewritePatternList *patterns) { + patterns->insert( + *converter); +} + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc new file mode 100644 index 00000000000..9b809049290 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +class TestLhloToLLVMPass + : public ::mlir::PassWrapper> { + public: + void runOnOperation() override { + ModuleOp m = getOperation(); + + OwningRewritePatternList patterns; + LLVMTypeConverter converter(m.getContext()); + populateStdToLLVMConversionPatterns(converter, patterns); + PopulateLhloToLLVMConversionPatterns(&converter, &patterns); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalOp(); + target.addIllegalDialect(); + + if (failed(applyFullConversion(m, target, patterns, &converter))) { + signalPassFailure(); + } + } +}; + +} // namespace + +static PassRegistration legalize_lhlo_pass( + "test-lhlo-legalize-to-llvm", "Legalize from LHLO dialect to LLVM."); + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index 54b3acd3787..734a75a4307 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -18,7 +18,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -61,15 +61,15 @@ Value ApplySingleResultLhloCode(Location loc, ValueRange operands, // Converts a block with LHLO ops and with signature: // ^bb(%lhs: memref, %rhs: memref, %res: memref): -// into a reduction operator of loop.reduce by doing buffer allocation for -// scalar arguments and the result of `loop.reduce` to make it compatible with +// into a reduction operator of scf.reduce by doing buffer allocation for +// scalar arguments and the result of `scf.reduce` to make it compatible with // LHLO ops. -void ConvertToReductionOperator(Location loc, loop::ReduceOp reduce_op, +void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op, Block* lhlo_block, OpBuilder* b) { Block& loop_reduce_op_body = reduce_op.reductionOperator().front(); OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(&loop_reduce_op_body); - b->create( + b->create( loc, ApplySingleResultLhloCode(loc, loop_reduce_op_body.getArguments(), lhlo_block, b)); } @@ -136,9 +136,9 @@ MappedIvs MapWindowIvsToInput(OpTy op, ValueRange ivs, ValueRange window_ivs, return mapped_ivs; } -// Returns loop::Parallel over a shaped value with static or dynamic shape. 
-loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, - OpBuilder* b) { +// Returns scf::Parallel over a shaped value with static or dynamic shape. +scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, + OpBuilder* b) { Value zero = b->create(loc, 0); Value one = b->create(loc, 1); @@ -151,10 +151,10 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, lower.push_back(zero); step.push_back(one); } - return b->create(loc, lower, upper, step); + return b->create(loc, lower, upper, step); } -// Converts `xla_lhlo.ReduceOp` into two loop::ParallelOp and a loop::ReduceOp. +// Converts `xla_lhlo.ReduceOp` into two scf::ParallelOp and a scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops if there are // any. The inner `ParalleOp` refers to the reduction loops and `ReduceOp` // contains the reduction operator. @@ -170,10 +170,10 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // is roughly converted into: // // %init = load %init_buf[] : memref -// loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { -// %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { +// scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { +// %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> -// loop.reduce(%elem_to_reduce) { +// scf.reduce(%elem_to_reduce) { // ^bb0(%elem: f32, %acc: f32): // no predecessors // elem_buf = alloc() : memref // store %elem, elem_buf[] : memref @@ -181,11 +181,11 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // store %acc, acc_buf[] : memref // // %acc_result = load acc_buf[] : memref -// loop.reduce.return %acc_result : f32 +// scf.reduce.return %acc_result : f32 // } : f32 -// loop.yield +// scf.yield // } : f32 -// loop.yield +// scf.yield // } class ReduceOpConverter : public OpConversionPattern { public: @@ -197,7 +197,7 @@ class ReduceOpConverter : public OpConversionPattern { // TODO(b/137624192) Implement variadic reduce. if (xla_reduce_op.out().size() != 1) return failure(); - loop::ReduceOp reduce_op = + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops(xla_reduce_op, &rewriter); ConvertToReductionOperator(xla_reduce_op.getLoc(), reduce_op, &xla_reduce_op.body().front(), &rewriter); @@ -206,26 +206,26 @@ class ReduceOpConverter : public OpConversionPattern { } private: - // Creates nested `loop.parallel` ops with `loop.reduce`. The outer ParallelOp + // Creates nested `scf.parallel` ops with `scf.reduce`. The outer ParallelOp // refers to the parallel dimensions of `xla_reduce_op` if any and the inner - // ParallelOp refers to the reduction dimensions. The loop.reduce op is + // ParallelOp refers to the reduction dimensions. The scf.reduce op is // returned. 
// // If the reduction argument is a memref<100x10x5xf32> and the // reduction is performed along dimension 1 then this method will generate // // %init = load %init_buf[] : memref - // loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { - // %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { + // scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { + // %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> - // loop.reduce(%elem_to_reduce) { + // scf.reduce(%elem_to_reduce) { // // } : f32 - // loop.yield + // scf.yield // } : f32 - // loop.yield + // scf.yield // } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceOp xla_reduce_op, ConversionPatternRewriter* rewriter) const { auto loc = xla_reduce_op.getLoc(); @@ -254,13 +254,13 @@ class ReduceOpConverter : public OpConversionPattern { SmallVector init_value = { rewriter->create(loc, *xla_reduce_op.init_values().begin())}; // Outer ParallelOp is not needed if it is a reduction across all dims. - loop::ParallelOp outer; + scf::ParallelOp outer; if (!parallel_lower.empty()) { - outer = rewriter->create(loc, parallel_lower, - parallel_upper, parallel_step); + outer = rewriter->create(loc, parallel_lower, + parallel_upper, parallel_step); rewriter->setInsertionPointToStart(outer.getBody()); } - loop::ParallelOp inner = rewriter->create( + scf::ParallelOp inner = rewriter->create( loc, reduce_lower, reduce_upper, reduce_step, init_value); Value reduction_result = *inner.getResults().begin(); @@ -294,7 +294,7 @@ class ReduceOpConverter : public OpConversionPattern { rewriter->setInsertionPointToStart(inner.getBody()); Value elem = rewriter->create( loc, *xla_reduce_op.operands().begin(), indices); - return rewriter->create(loc, elem); + return rewriter->create(loc, elem); } }; @@ -314,8 +314,8 @@ class ReduceOpConverter : public OpConversionPattern { // accumulator = reduction_operator(output[O], value) // output[O] = accumulator // -// Converts `xla_lhlo.ReduceWindowOp` into two loop::ParallelOp and a -// loop::ReduceOp. +// Converts `xla_lhlo.ReduceWindowOp` into two scf::ParallelOp and a +// scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops that traverese output // buffer. The inner `ParalleOp` refers to the reduction loops that traverse // reduction windows and `ReduceOp` contains the reduction operator. 
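[Editor's note: illustrative sketch, not part of the patch.] The ReduceOp lowering documented above emits an outer scf.parallel over the non-reduced dimensions and an inner scf.parallel plus scf.reduce over the reduced dimension (the pseudocode uses a memref<100x10x5xf32> reduced along dimension 1). A scalar C++ model of what that loop nest computes, assuming addition as the reduction operator and a small illustrative 4x3x2 shape, is:

#include <array>
#include <iostream>

int main() {
  constexpr int I = 4, J = 3, K = 2;          // reduce along the middle dim J
  std::array<float, I * J * K> buffer{};
  for (int n = 0; n < I * J * K; ++n) buffer[n] = static_cast<float>(n);

  const float init = 0.0f;                    // plays the role of %init_buf
  std::array<float, I * K> out{};             // reduced dim removed from result

  for (int i = 0; i < I; ++i) {               // outer "parallel" loops (i, k)
    for (int k = 0; k < K; ++k) {
      float acc = init;                       // init value of the inner loop
      for (int j = 0; j < J; ++j)             // inner loop over the reduced dim
        acc += buffer[(i * J + j) * K + k];   // reduction operator body
      out[i * K + k] = acc;
    }
  }
  std::cout << "out[0] = " << out[0] << "\n";
}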
@@ -341,20 +341,20 @@ class ReduceOpConverter : public OpConversionPattern { // is roughly converted into: // // %neutral_elem = load %init_buf[] : memref -// loop.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { -// %result = loop.parallel (%iw, %jw) = (%c0, %c0) +// scf.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { +// %result = scf.parallel (%iw, %jw) = (%c0, %c0) // to (%c3, %c3) step (%c1, %c1) neutral_elem (%0) -> f32 { // %in_bounds = // %elem = load %operand[%computed_i, %computed_j] // %elem_or_neutral = select %in_bounds, %elem, %neutral_elem : f32 -// loop.reduce(%elem_to_reduce) : f32 { +// scf.reduce(%elem_to_reduce) : f32 { // ^bb0(%arg7: f32, %arg8: f32): // // } -// loop.yield +// scf.yield // } // store %result, %output_buffer[%i, %j] : memref<56x56xf32> -// loop.yield +// scf.yield // } // return // } @@ -366,12 +366,12 @@ class ReduceWindowOpConverter LogicalResult matchAndRewrite( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ArrayRef /*args*/, ConversionPatternRewriter& rewriter) const final { - loop::ParallelOp output_loop, window_loop; + scf::ParallelOp output_loop, window_loop; std::tie(output_loop, window_loop) = CreateParallelLoopsToTraverseOutputAndWindow(xla_reduce_window_op, &rewriter); - loop::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( xla_reduce_window_op, output_loop, window_loop, &rewriter); ConvertToReductionOperator(xla_reduce_window_op.getLoc(), reduce_op, @@ -381,7 +381,7 @@ class ReduceWindowOpConverter } private: - std::pair + std::pair CreateParallelLoopsToTraverseOutputAndWindow( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ConversionPatternRewriter* rewriter) const { @@ -405,7 +405,7 @@ class ReduceWindowOpConverter window_upper.push_back( rewriter->create(loc, window_dim.getSExtValue())); } - auto window_loop = rewriter->create( + auto window_loop = rewriter->create( loc, window_lower, window_upper, window_step, init_value); Value reduction_result = *window_loop.getResults().begin(); @@ -414,9 +414,9 @@ class ReduceWindowOpConverter return std::make_pair(output_loop, window_loop); } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceWindowOp xla_reduce_window_op, - loop::ParallelOp output_loop, loop::ParallelOp window_loop, + scf::ParallelOp output_loop, scf::ParallelOp window_loop, ConversionPatternRewriter* rewriter) const { rewriter->setInsertionPointToStart(window_loop.getBody()); auto loc = xla_reduce_window_op.getLoc(); @@ -436,20 +436,20 @@ class ReduceWindowOpConverter xla_reduce_window_op, output_loop.getInductionVars(), window_loop.getInductionVars(), rewriter); - auto elem_or_init = rewriter->create( + auto elem_or_init = rewriter->create( loc, xla_operand_type.getElementType(), mapped_ivs.in_bounds, /*withElseRegion=*/true); OpBuilder then_builder = elem_or_init.getThenBodyBuilder(); Value elem = then_builder.create( loc, xla_reduce_window_op.operand(), mapped_ivs.ivs); - then_builder.create(loc, elem); + then_builder.create(loc, elem); OpBuilder else_builder = elem_or_init.getElseBodyBuilder(); - else_builder.create(loc, *window_loop.initVals().begin()); + else_builder.create(loc, *window_loop.initVals().begin()); - return rewriter->create(loc, - *elem_or_init.results().begin()); + return rewriter->create(loc, + *elem_or_init.results().begin()); } }; @@ -457,16 +457,16 @@ class ReduceWindowOpConverter // 
https://www.tensorflow.org/xla/operation_semantics#selectandscatter // // Pseudocode: -// loop.parallel(coordinates O in the output): +// scf.parallel(coordinates O in the output): // output[O] = init -// loop.parallel(coordinates S in the source): +// scf.parallel(coordinates S in the source): // selected_ivs = 0 // selected_val = 0 // initialized_flag = false -// loop.for (first dim W_1 in the window) +// scf.for (first dim W_1 in the window) // iter_args (selected_ivs, selected_val, initialized_flag): // ... -// loop.for (last dim W_N in the window): +// scf.for (last dim W_N in the window): // iter_args (selected_ivs, selected_val, initialized_flag): // I = S * stride + W - pad_low // if I within bounds of operand: @@ -490,7 +490,7 @@ class SelectAndScatterOpConverter ConversionPatternRewriter& rewriter) const final { auto loc = s_and_s_op.getLoc(); InitializeOutput(s_and_s_op, &rewriter); - loop::ParallelOp loop_over_src = + scf::ParallelOp loop_over_src = MakeLoopOverShape(loc, s_and_s_op.source(), &rewriter); rewriter.setInsertionPointToStart(loop_over_src.getBody()); @@ -520,7 +520,7 @@ class SelectAndScatterOpConverter auto loc = s_and_s_op.getLoc(); Value init_value = b->create(loc, s_and_s_op.init_value()); - loop::ParallelOp loop_over_output = + scf::ParallelOp loop_over_output = MakeLoopOverShape(loc, s_and_s_op.out(), b); OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(loop_over_output.getBody()); @@ -531,10 +531,10 @@ class SelectAndScatterOpConverter struct WindowLoops { SmallVector selected_ivs; SmallVector window_ivs; - loop::ForOp inner_loop; + scf::ForOp inner_loop; }; WindowLoops InsertWindowLoops(xla_lhlo::SelectAndScatterOp s_and_s_op, - loop::ParallelOp loop_over_src, + scf::ParallelOp loop_over_src, OpBuilder* b) const { auto loc = s_and_s_op.getLoc(); Value zero = b->create(loc, 0); @@ -558,12 +558,12 @@ class SelectAndScatterOpConverter s_and_s_op.window_dimensions()->getIntValues()) { Value upper = b->create(loc, window_dim.getSExtValue()); result.inner_loop = - b->create(loc, zero, upper, one, iter_args); + b->create(loc, zero, upper, one, iter_args); if (b->getInsertionBlock() == loop_over_src.getBody()) { ip = b->saveInsertionPoint(); result.selected_ivs = result.inner_loop.getResults().take_front(rank); } else { - b->create(loc, result.inner_loop.getResults()); + b->create(loc, result.inner_loop.getResults()); } b->setInsertionPointToStart(result.inner_loop.getBody()); iter_args = ValueRange{result.inner_loop.getRegionIterArgs()}; @@ -599,7 +599,7 @@ class SelectAndScatterOpConverter }; SmallVector SelectIvs(xla_lhlo::SelectAndScatterOp s_and_s_op, - loop::ParallelOp loop_over_src, + scf::ParallelOp loop_over_src, OpBuilder* b) const { auto loc = s_and_s_op.getLoc(); @@ -614,7 +614,7 @@ class SelectAndScatterOpConverter IterArgs ivs_val_flag(window_loops.inner_loop.getRegionIterArgs()); - auto if_in_bounds = inner_loop_b.create( + auto if_in_bounds = inner_loop_b.create( loc, window_loops.inner_loop.getResultTypes(), mapped_ivs.in_bounds, /*withElseRegion=*/true); @@ -623,16 +623,16 @@ class SelectAndScatterOpConverter OpBuilder in_bounds_then_b = if_in_bounds.getThenBodyBuilder(); auto select_or_init_results = SelectOrInitialize( s_and_s_op, mapped_ivs.ivs, &ivs_val_flag, &in_bounds_then_b); - in_bounds_then_b.create(loc, select_or_init_results); + in_bounds_then_b.create(loc, select_or_init_results); } // Case when we are in the pad. 
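A rough standalone C++ sketch of the per-window selection logic described by the pseudocode above, reduced to one dimension (the in-pad branch of the conversion continues right after this sketch). The stride, low padding, and a greater-than select function are assumptions for illustration; the real pass reads them from the op's attributes and its select region.

    #include <cstdint>
    #include <vector>

    // For one source coordinate S, visit every window offset W, map it to an
    // operand index I = S * stride + W - pad_low, skip out-of-bounds (padding)
    // positions, and track the selected element plus an initialized flag, much
    // like the iter_args threaded through the scf.for loops.
    int64_t SelectInWindow(const std::vector<float>& operand, int64_t S,
                           int64_t window_size, int64_t stride, int64_t pad_low) {
      int64_t selected_iv = -1;
      float selected_val = 0.0f;
      bool initialized = false;
      for (int64_t W = 0; W < window_size; ++W) {
        int64_t I = S * stride + W - pad_low;
        if (I < 0 || I >= static_cast<int64_t>(operand.size())) continue;  // in the pad
        if (!initialized || operand[I] > selected_val) {  // select function (assumed: >)
          selected_iv = I;
          selected_val = operand[I];
          initialized = true;
        }
      }
      return selected_iv;  // -1 means the whole window fell into the padding
    }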
{ OpBuilder in_bounds_else_b = if_in_bounds.getElseBodyBuilder(); - in_bounds_else_b.create(loc, ivs_val_flag.to_vector()); + in_bounds_else_b.create(loc, ivs_val_flag.to_vector()); } - inner_loop_b.create(loc, if_in_bounds.getResults()); + inner_loop_b.create(loc, if_in_bounds.getResults()); return window_loops.selected_ivs; } @@ -647,8 +647,8 @@ class SelectAndScatterOpConverter Value operand_elem = b->create(loc, s_and_s_op.operand(), operand_ivs); auto if_init = - b->create(loc, iter_arg_types, ivs_val_flag->is_init(), - /*withElseRegion=*/true); + b->create(loc, iter_arg_types, ivs_val_flag->is_init(), + /*withElseRegion=*/true); // Init == true, i.e. iter args are already initialized with a selected // element in boundaries of the operand. Select function has to be computed // here. @@ -660,32 +660,31 @@ class SelectAndScatterOpConverter ApplySingleResultLhloCode(loc, {operand_elem, ivs_val_flag->value()}, &lhlo_select, &if_init_then_b); - auto if_pred = - if_init_then_b.create(loc, iter_arg_types, pred, - /*withElseRegion=*/true); + auto if_pred = if_init_then_b.create(loc, iter_arg_types, pred, + /*withElseRegion=*/true); // Pred == true, therefore pack newly selected ivs, val and init flag back // to iter_args and return. { OpBuilder if_pred_then_b = if_pred.getThenBodyBuilder(); - if_pred_then_b.create( + if_pred_then_b.create( loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); } // Pred == false, therefore return old iter_args. { OpBuilder if_pred_else_b = if_pred.getElseBodyBuilder(); - if_pred_else_b.create(loc, ivs_val_flag->to_vector()); + if_pred_else_b.create(loc, ivs_val_flag->to_vector()); } - if_init_then_b.create(loc, if_pred.getResults()); + if_init_then_b.create(loc, if_pred.getResults()); } // Init == false, i.e. only pad was visited before and this is the first // element in the boundaries of the operand. { OpBuilder if_init_else_b = if_init.getElseBodyBuilder(); - if_init_else_b.create( + if_init_else_b.create( loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); } return if_init.getResults(); @@ -708,7 +707,7 @@ struct LhloLegalizeToParallelLoops ConversionTarget target(getContext()); target.addLegalDialect(); + scf::SCFDialect, XlaLhloDialect>(); target.addIllegalOp(); diff --git a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td index dcb0ab20e9e..e1ae5ef6abf 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td @@ -28,70 +28,62 @@ include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" // and imaginary components. 
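The patterns below decompose complex multiplication, division, absolute value, and exponential into operations on the real and imaginary components. As a quick sanity check of those identities (standalone C++, not part of the TableGen patterns themselves), compared against std::complex:

    #include <cassert>
    #include <cmath>
    #include <complex>

    int main() {
      std::complex<double> lhs(1.5, -2.0), rhs(0.25, 3.0);
      double a = lhs.real(), b = lhs.imag(), c = rhs.real(), d = rhs.imag();

      // Multiplication: result.real = ac - bd, result.imag = ad + bc.
      std::complex<double> mul(a * c - b * d, a * d + b * c);
      assert(std::abs(mul - lhs * rhs) < 1e-12);

      // Division: numerator = lhs * conj(rhs); denominator = rhs * conj(rhs),
      // which is purely real, so the division distributes over the components.
      std::complex<double> num = lhs * std::conj(rhs);
      double den = (rhs * std::conj(rhs)).real();
      std::complex<double> div(num.real() / den, num.imag() / den);
      assert(std::abs(div - lhs / rhs) < 1e-12);

      // Absolute value: sqrt(real * real + imag * imag).
      assert(std::abs(std::sqrt(a * a + b * b) - std::abs(lhs)) < 1e-12);

      // Exponential: exp(real) * (cos(imag) + i * sin(imag)).
      std::complex<double> ex(std::exp(a) * std::cos(b), std::exp(a) * std::sin(b));
      assert(std::abs(ex - std::exp(lhs)) < 1e-12);
      return 0;
    }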
foreach elementwiseOp = [HLO_AddOp, HLO_SubOp] in def : Pat<(elementwiseOp HLO_ComplexTensor:$lhs, - HLO_ComplexTensor:$rhs, $broadcast_dimensions), + HLO_ComplexTensor:$rhs), (HLO_ComplexOp - (elementwiseOp (HLO_RealOp $lhs), (HLO_RealOp $rhs), - $broadcast_dimensions), - (elementwiseOp (HLO_ImagOp $lhs), (HLO_ImagOp $rhs), - $broadcast_dimensions))>; + (elementwiseOp (HLO_RealOp $lhs), (HLO_RealOp $rhs)), + (elementwiseOp (HLO_ImagOp $lhs), (HLO_ImagOp $rhs)))>; // Complex multiplication results in a cross product multiplication between the // real and imaginary components such that: // result.real = lhs.real * rhs.real - lhs.imag * rhs.imag // result.imag = lhs.imag * rhs.real + lhs.real * rhs.imag def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, - HLO_ComplexTensor:$rhs, $broadcast_dimensions), + HLO_ComplexTensor:$rhs), (HLO_ComplexOp (HLO_SubOp (HLO_MulOp (HLO_RealOp:$lhs_real $lhs), - (HLO_RealOp:$rhs_real $rhs), - $broadcast_dimensions), + (HLO_RealOp:$rhs_real $rhs)), (HLO_MulOp (HLO_ImagOp:$lhs_imag $lhs), - (HLO_ImagOp:$rhs_imag $rhs), - $broadcast_dimensions), - (NullDenseIntElementsAttr)), + (HLO_ImagOp:$rhs_imag $rhs))), (HLO_AddOp - (HLO_MulOp $lhs_real, $rhs_imag, $broadcast_dimensions), - (HLO_MulOp $lhs_imag, $rhs_real, $broadcast_dimensions), - (NullDenseIntElementsAttr)))>; + (HLO_MulOp $lhs_real, $rhs_imag), + (HLO_MulOp $lhs_imag, $rhs_real)))>; // Multiplication between a complex and real tensor can be distributed by // applying the real multiplicant to both the real and complex component. // // Note that the sourcep pattern is not legal according to the HLO dialect but // instead handle intermediates generated by other patterns. -def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs), (HLO_ComplexOp - (HLO_MulOp (HLO_RealOp $lhs), $rhs, $broadcast_dimensions), - (HLO_MulOp (HLO_ImagOp $lhs), $rhs, $broadcast_dimensions))>; + (HLO_MulOp (HLO_RealOp $lhs), $rhs), + (HLO_MulOp (HLO_ImagOp $lhs), $rhs))>; -def : Pat<(HLO_MulOp HLO_IntOrFpTensor:$lhs, HLO_ComplexTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_MulOp HLO_IntOrFpTensor:$lhs, HLO_ComplexTensor:$rhs), (HLO_ComplexOp - (HLO_MulOp $lhs, (HLO_RealOp $rhs), $broadcast_dimensions), - (HLO_MulOp $lhs, (HLO_ImagOp $rhs), $broadcast_dimensions))>; + (HLO_MulOp $lhs, (HLO_RealOp $rhs)), + (HLO_MulOp $lhs, (HLO_ImagOp $rhs)))>; // Division is performed by normalizing the denominator by multiplying by the // conjugate of the rhs. 
// numerator = lhs * conj(rhs) // denominator = rhs * conj(rhs) -def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_ComplexTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_ComplexTensor:$rhs), (HLO_DivOp (HLO_MulOp:$num $lhs, (HLO_ComplexOp:$conj (HLO_RealOp $rhs), - (HLO_NegOp (HLO_ImagOp $rhs))), - $broadcast_dimensions), - (HLO_RealOp:$den (HLO_MulOp $rhs, $conj, $broadcast_dimensions)), - (BinBroadcastDimensions $num, $den))>; + (HLO_NegOp (HLO_ImagOp $rhs)))), + (HLO_RealOp:$den (HLO_MulOp $rhs, $conj)))>; -def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs, $broadcast_dimensions), +def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs), (HLO_ComplexOp - (HLO_DivOp (HLO_RealOp $lhs), $rhs, $broadcast_dimensions), - (HLO_DivOp (HLO_ImagOp $lhs), $rhs, $broadcast_dimensions))>; + (HLO_DivOp (HLO_RealOp $lhs), $rhs), + (HLO_DivOp (HLO_ImagOp $lhs), $rhs))>; // Absolute value is evaluated as: @@ -100,11 +92,8 @@ def : Pat<(HLO_AbsOp HLO_ComplexTensor:$val), (HLO_ComplexOp (HLO_SqrtOp (HLO_AddOp - (HLO_MulOp (HLO_RealOp:$real $val), $real, - (NullDenseIntElementsAttr)), - (HLO_MulOp (HLO_ImagOp:$imag $val), $imag, - (NullDenseIntElementsAttr)), - (NullDenseIntElementsAttr))), + (HLO_MulOp (HLO_RealOp:$real $val), $real), + (HLO_MulOp (HLO_ImagOp:$imag $val), $imag))), (HLO_ConstOp (ConstantSplat<"0"> $real)))>; // Exponential can be lowered to an exponential on the real component and a @@ -117,5 +106,4 @@ def : Pat<(HLO_ExpOp HLO_ComplexTensor:$val), (HLO_ExpOp (HLO_RealOp $val)), (HLO_ComplexOp (HLO_CosOp (HLO_ImagOp:$imag $val)), - (HLO_SinOp $imag)), - (NullDenseIntElementsAttr))>; + (HLO_SinOp $imag)))>; diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index fed21e9bafc..21b954a3eb4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -49,6 +49,7 @@ MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index dceb73efb33..c317dc36b3c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -227,6 +227,28 @@ inline Value MapLhloOpToStdScalarOp( loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + template <> inline Value MapLhloOpToStdScalarOp( Location loc, ArrayRef result_types, ArrayRef args, @@ -259,11 +281,9 @@ inline Value MapLhloOpToStdScalarOp( // No conversion is needed for the same width integers return args.front(); } - // TODO(dfki-ehna): Add other primitive type conversions - // if 
(mlir::FpToSiOp::areCastCompatible(sourceType, targetType)) { - // return b.create(loc, result_types, - // args,mlir::None); - // } + if (mlir::FPToSIOp::areCastCompatible(sourceType, targetType)) { + return b->create(loc, result_types, args, mlir::None); + } return nullptr; } diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index a4ffa57957e..c56f5adc12d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -28,272 +28,6 @@ namespace xla_hlo { namespace { -// Returns a 1-d i64 elements attribute populated with numbers from start to -// end, excluding. -static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, - Builder *builder) { - int size = end - start; - - SmallVector vals; - vals.resize(size); - std::iota(vals.begin(), vals.end(), start); - - TensorType ty = RankedTensorType::get({size}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, vals); -} - -// Helper function for OpRewritePattern classes to materialize broadcasts on -// LHS and RHS arguments to a binary op. -// -// Returns true and sets out_lhs and out_rhs to BroadcastInDimOps if successful, -// returns false otherwise. -template -bool CreateStaticBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, - Value *out_lhs, Value *out_rhs) { - if (!op.broadcast_dimensions().hasValue()) { - // Note: the op may still have an implicit broadcast on it, such as - // for (tensor<1xf32>, tensor<4xf32>). - return false; - } - - // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, - // replacing the original LHS and RHS args in the source op with the results - // of the broadcasts. - // - // If the higher dimensional argument does not actually need the broadcast, - // a canonicalization pass should be able to remove that op later. - Value lhs = op.lhs(); - Value rhs = op.rhs(); - - auto op_ranked_type = op.getType().template dyn_cast(); - auto lhs_ranked_type = lhs.getType().dyn_cast(); - auto rhs_ranked_type = rhs.getType().dyn_cast(); - if (!op_ranked_type || !lhs_ranked_type || !rhs_ranked_type) { - // Unranked, can't determine at this point how to perform the broadcast. - return false; - } - - // Dynamic result shape, can't use BroadcastInDimOp. - assert(op_ranked_type.hasStaticShape() && - "dynamic shape requires DynamicBroadcastInDim"); - - auto lhs_rank = lhs_ranked_type.getRank(); - auto rhs_rank = rhs_ranked_type.getRank(); - - // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. - // Use the original op.broadcast_dimensions for the lower rank arg. - auto higher_rank_broadcast_dims = - GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); - DenseIntElementsAttr lhs_broadcast_dims; - DenseIntElementsAttr rhs_broadcast_dims; - if (lhs_rank > rhs_rank) { - lhs_broadcast_dims = higher_rank_broadcast_dims; - rhs_broadcast_dims = op.broadcast_dimensions().getValue(); - } else if (lhs_rank < rhs_rank) { - lhs_broadcast_dims = op.broadcast_dimensions().getValue(); - rhs_broadcast_dims = higher_rank_broadcast_dims; - } else { - // This shouldn't happen for legal ops. If the broadcast_dimensions - // attribute is set, the ranks should be different. - // TODO(scotttodd): Add a custom verification for ops and assert here. 
- return false; - } - - // BroadcastInDimOp must have the same element type for operands and results, - // so preserve the original output shape and the original input element type. - // For example, `SrcOp (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xi1>`: - // broadcast_in_dim (tensor<1x4xf32>) -> tensor<1x4xf32> - // broadcast_in_dim (tensor<4xf32>) -> tensor<1x4xf32> - // SrcOp (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xi1> - ArrayRef op_shape = op_ranked_type.getShape(); - auto lhs_type = - RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); - auto rhs_type = - RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); - - *out_lhs = rewriter->createOrFold(op.getLoc(), lhs_type, - lhs, lhs_broadcast_dims); - *out_rhs = rewriter->createOrFold(op.getLoc(), rhs_type, - rhs, rhs_broadcast_dims); - return true; -} - -// Helper template to generate code for computing the result shape of a -// broadcasted operation. This ultimately should be subsumed by functions -// from the shape dialect. -// Assumes that large and small are the operand values of `op` and that they -// have a ranked tensory type with rank(large) >= rank(small). -template -std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, - PatternRewriter *rewriter) { - auto loc = op.getLoc(); - auto larger_ranked_type = large.getType().cast(); - auto output_rank = larger_ranked_type.getRank(); - - constexpr int kExpandShape = -1; - - std::vector shape_values; - shape_values.reserve(output_rank); - std::vector indexes(output_rank, kExpandShape); - DenseIntElementsAttr broadcast_dimensions = - op.broadcast_dimensions().getValue(); - // Compute a mapping from output dimensions to their corresponding input - // dimensions in the smaller ranked operand. - for (auto pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { - indexes.at(pair.value().getLimitedValue()) = pair.index(); - } - - // Compute the broadcasted shape of the result using numpy style broadcasting - // semantics. The result shape at a position is the shape of the larger - // operand at that position if the no dimension of the smaller operand is - // mapped to it. - // If both operands contribute to an output dimension, their shape has to - // either be the same in that dimension or it can be 1, in which case the - // shape of the other operand is used. - for (int i = 0; i < output_rank; ++i) { - Value index_value; - if (indexes[i] == kExpandShape) { - // The smaller shape gets expanded to the larger one in this case. - index_value = rewriter->create(loc, large, i); - } else { - // Compute the result shape depending on whether the rank of smaller is 1. - // This does not check that the broadcast operation actualy is correct. - // In particular, we do not check that both shapes are the same if the - // smaller ranked shape is not 1. - ConstantOp one = rewriter->create( - loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); - DimOp lrg_dim = rewriter->create(loc, large, i); - DimOp sml_dim = rewriter->create(loc, small, indexes[i]); - CmpIOp compare = - rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); - index_value = - rewriter->create(loc, compare, lrg_dim, sml_dim); - } - // Ideally, we would like to keep this on index but MLIR does not allow - // this. 
- shape_values.push_back(rewriter->create( - loc, index_value, rewriter->getIntegerType(32))); - } - - return shape_values; -} - -// Helper function for OpRewritePattern classes to materialize dynamic -// broadcasts on LHS and RHS arguments to a binary op. -// -// Returns true and set out_lhs and out_rhs for materialized dynamic broadcasts -// for LHS and RHS arguments, else returns false. -template -bool CreateDynamicBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, - Value *out_lhs, Value *out_rhs) { - if (!op.broadcast_dimensions().hasValue()) { - // Note: the op may still have an implicit broadcast on it, such as - // for (tensor<1xf32>, tensor<4xf32>). - return false; - } - - // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, - // replacing the original LHS and RHS args in the source op with the results - // of the broadcasts. - Value lhs = op.lhs(); - Value rhs = op.rhs(); - - auto lhs_ranked_type = lhs.getType().dyn_cast(); - auto rhs_ranked_type = rhs.getType().dyn_cast(); - if (!lhs_ranked_type || !rhs_ranked_type) { - // Unranked, can't determine at this point how to perform the broadcast. - return false; - } - - auto lhs_rank = lhs_ranked_type.getRank(); - auto rhs_rank = rhs_ranked_type.getRank(); - - // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. - // Use the original op.broadcast_dimensions for the lower rank arg. - auto higher_rank_broadcast_dims = - GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); - DenseIntElementsAttr lhs_broadcast_dims; - DenseIntElementsAttr rhs_broadcast_dims; - std::vector shape_elements; - if (lhs_rank > rhs_rank) { - lhs_broadcast_dims = higher_rank_broadcast_dims; - rhs_broadcast_dims = op.broadcast_dimensions().getValue(); - shape_elements = ComputeBroadcastedShape(op, rhs, lhs, rewriter); - } else if (lhs_rank < rhs_rank) { - lhs_broadcast_dims = op.broadcast_dimensions().getValue(); - rhs_broadcast_dims = higher_rank_broadcast_dims; - shape_elements = ComputeBroadcastedShape(op, lhs, rhs, rewriter); - } else { - // This shouldn't happen for legal ops. If the broadcast_dimensions - // attribute is set, the ranks should be different. - // TODO(scotttodd): Add a custom verification for ops and assert here. - return false; - } - - // DynamicBroadcastInDimOp preserves the element type but produces a tensor - // with unranked shape. The rank of the output is the length of the - // output shape argument. - SmallVector op_shape(shape_elements.size(), - RankedTensorType::kDynamicSize); - auto lhs_type = - RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); - auto rhs_type = - RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); - - // We need a way to turn a list of scalars into a vector. While Standard - // dialect does not have one, use the XLA_HLO variant. 
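The deleted helpers above compute result shapes with numpy-style semantics: an output dimension keeps the larger operand's extent unless that extent is 1 and a dimension of the smaller operand is mapped onto it via broadcast_dimensions, in which case the smaller operand's extent is used. A compact C++ sketch of that rule for static extents (names and the shape representation are invented here for illustration):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // `large` has the output rank; `small` has lower rank; broadcast_dims[i] is
    // the output dimension that small's i-th dimension maps to.
    std::vector<int64_t> BroadcastedShape(const std::vector<int64_t>& large,
                                          const std::vector<int64_t>& small,
                                          const std::vector<int64_t>& broadcast_dims) {
      std::vector<int64_t> result(large);
      for (std::size_t i = 0; i < small.size(); ++i) {
        auto out_dim = static_cast<std::size_t>(broadcast_dims[i]);
        // If the larger operand has extent 1 here, the smaller operand's extent
        // wins; otherwise the extents are expected to agree (not re-checked,
        // matching the original helper's comment).
        if (large[out_dim] == 1) result[out_dim] = small[i];
      }
      return result;
    }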
- int shape_size = shape_elements.size(); - Type shape_element_type = shape_elements.front().getType(); - Value shape_value = rewriter->create( - op.getLoc(), RankedTensorType::get({shape_size}, shape_element_type), - shape_elements); - - *out_lhs = rewriter->createOrFold( - op.getLoc(), lhs_type, lhs, shape_value, lhs_broadcast_dims); - *out_rhs = rewriter->createOrFold( - op.getLoc(), rhs_type, rhs, shape_value, rhs_broadcast_dims); - return true; -} - -template -bool CreateBroadcastForBinaryOp(SrcOp op, PatternRewriter *rewriter, - Value *out_lhs, Value *out_rhs) { - auto op_ranked_type = op.getType().template dyn_cast(); - if (!op_ranked_type) return false; - - if (op_ranked_type.hasStaticShape()) { - if (!CreateStaticBroadcastsForBinaryOp(op, rewriter, out_lhs, out_rhs)) { - return false; - } - } else { - if (!CreateDynamicBroadcastsForBinaryOp(op, rewriter, out_lhs, out_rhs)) { - return false; - } - } - return true; -} - -template -struct BinaryOpWithBroadcastConvert : public OpRewritePattern { - explicit BinaryOpWithBroadcastConvert(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(SrcOp op, - PatternRewriter &rewriter) const override { - Value new_lhs; - Value new_rhs; - - if (!CreateBroadcastForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) - return failure(); - - // Replace the original op with a new one that uses the new args. - // New args are broadcasts, so no dims are needed on the replacement op. - rewriter.replaceOpWithNewOp(op, op.getType(), new_lhs, new_rhs, - /*broadcast_dims=*/nullptr); - return success(); - } -}; - // Converts ClampOp with broadcast semantics. ClampOp requires "all three arrays // must be the same shape. Alternatively, as a restricted form of broadcasting, // min and/or max can be a scalar of type T." @@ -335,57 +69,10 @@ struct ClampWithBroadcastConvert : public OpRewritePattern { } }; -// Specialized class for CompareOp, as it has an additional builder argument. -struct CompareWithBroadcastConvert : public OpRewritePattern { - explicit CompareWithBroadcastConvert(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(CompareOp op, - PatternRewriter &rewriter) const override { - Value new_lhs; - Value new_rhs; - - if (!CreateBroadcastForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) - return failure(); - - rewriter.replaceOpWithNewOp(op, op.getType(), new_lhs, new_rhs, - /*broadcast_dims=*/nullptr, - op.comparison_direction()); - return success(); - } -}; - } // namespace void SetupMaterializeBroadcastsLegality(MLIRContext *context, ConversionTarget *conversionTarget) { -#define ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OpType) \ - conversionTarget->addDynamicallyLegalOp( \ - [](OpType op) { return !op.broadcast_dimensions().hasValue(); }); - // Binary elementwise ops. - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(AddOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(Atan2Op); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(DivOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MaxOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MinOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(MulOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(PowOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(RemOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftLeftOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftRightArithmeticOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(ShiftRightLogicalOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(SubOp); - - // Binary logical elementwise ops. 
- ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(AndOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OrOp); - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(XorOp); - - // CompareOp. - ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(CompareOp); - -#undef ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST - conversionTarget->addDynamicallyLegalOp([](ClampOp op) { return op.max().getType() == op.operand().getType() && op.min().getType() == op.operand().getType(); @@ -394,30 +81,10 @@ void SetupMaterializeBroadcastsLegality(MLIRContext *context, void PopulateMaterializeBroadcastsPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - // Binary elementwise ops. - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>( - context); - patterns->insert>(context); - patterns->insert>(context); - - // Binary logical elementwise ops. - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - - // ClampOp. It can have a restricted form of broadcasting. + // ClampOp. This op has a special case where it accepts either same-shaped + // inputs or scalars (a restricted form of broadcasting). This makes the + // broadcast explicit. patterns->insert(context); - // CompareOp. Note the specialized class instead of using the template. - patterns->insert(context); } } // namespace xla_hlo diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 2d0164981a3..e3dd5380d7c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -36,7 +36,7 @@ namespace xla_hlo { /// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is /// false, emits an error if there is any operation that can't be legalized. std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion = false); + bool allow_partial_conversion = false, bool legalize_chlo = true); /// Lowers from TF dialect to HLO dialect using tf2xla op kernels for the /// specified device type. @@ -50,7 +50,8 @@ std::unique_ptr> createLegalizeTFControlFlowPass(); /// dialect using the conversion patterns registered by the HLO dialect. When /// allow_partial_conversion is false, emits an error if there is any operation /// that can't be legalized. -LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false); +LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false, + bool legalize_chlo = true); /// Lowers HLO control flow ops to the Standard dialect. std::unique_ptr> createLegalizeControlFlowPass(); @@ -65,6 +66,13 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); +// Transforms unranked HLO operations to ranked ones where possible. +std::unique_ptr> createTransformUnrankedHloPass(); + +// Sinks constants implicitly captured in control flow regions. This is +// necessary to export to XLA. +std::unique_ptr> createSinkConstantsToControlFlowPass(); + } // namespace xla_hlo namespace xla_lhlo { @@ -81,8 +89,8 @@ std::unique_ptr> createLegalizeToGpuPass(); // Fuses linalg ops obtained after LHLO lowering. To enable fusion, // operations are first tiled. // -// When 'use_parallel_loops' is set, the tiling will use loop.parallel -// operations. 
Otherwise, loop.for operations are used. +// When 'use_parallel_loops' is set, the tiling will use scf.parallel +// operations. Otherwise, scf.for operations are used. // // 'tile_sizes' provides the tile sizes to use for tiling. If the linalg // operation has more dimensions than tile sizes provided, 1 is used as diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index ad81cda19b9..59347198fe4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -23,6 +23,9 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { +class LLVMTypeConverter; +class OwningRewritePatternList; +class BufferAssignmentPlacer; namespace xla_hlo { // Collection of rewrite patterns for lowering a general dot product. @@ -38,9 +41,9 @@ void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, MLIRContext *ctx); // Collection of rewrite patterns for lowering of HLO to LHLO dialect. -void populateHLOToLHLOConversionPattern(MLIRContext *context, - OwningRewritePatternList *patterns); - +void populateHLOToLHLOConversionPattern( + MLIRContext *context, BufferAssignmentPlacer *bufferAssignment, + TypeConverter *converter, OwningRewritePatternList *patterns); // Collection of rewrite patterns for lowering of HLO to Linalg dialect. void populateHLOToLinalgConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); @@ -54,6 +57,15 @@ void SetupMaterializeBroadcastsLegality(MLIRContext *context, void PopulateMaterializeBroadcastsPatterns(MLIRContext *context, OwningRewritePatternList *patterns); +// Sets up legality definitions for element-wise operations on ranked tensors. +void SetupTransformUnrankedHloLegality(MLIRContext *context, + ConversionTarget *conversionTarget); + +// Populates a collection of rewrite patterns to realize element-wise operations +// on ranked tensors where possible. +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + // Populate a collection of conversion patterns for un-fusing // batch_norm_inference and batch_norm_training into constituent HLO ops. // TODO(laurenzo): Implement un-fusing of batch_norm_training. @@ -62,6 +74,14 @@ void PopulateUnfuseBatchNormPatterns(MLIRContext *context, } // namespace xla_hlo +namespace xla_lhlo { + +/// Collect a set of patterns to convert from the LHLO dialect to LLVM. +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, + OwningRewritePatternList *patterns); + +} // namespace xla_lhlo + namespace xla_chlo { // Populates a collection of conversion patterns for legalizing client-HLO to diff --git a/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc new file mode 100644 index 00000000000..5a45e0f3b18 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" + +namespace mlir { +namespace xla_hlo { + +namespace { + +// A pass that sinks constants implicitly captured in control flow regions. This +// is necessary to export to XLA. +class SinkConstantsToControlFlow + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](Operation* op) { + if (auto while_op = llvm::dyn_cast(op)) { + SinkToRegion(&while_op.body()); + SinkToRegion(&while_op.cond()); + } else if (auto if_op = llvm::dyn_cast(op)) { + SinkToRegion(&if_op.true_branch()); + SinkToRegion(&if_op.false_branch()); + } + }); + } + + private: + // Performs constant sinking into a region. + static void SinkToRegion(Region* region) { + llvm::DenseMap sunk_constant; + visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) { + Value constant = use->get(); + auto const_op = dyn_cast_or_null(constant.getDefiningOp()); + if (!const_op) return; + auto map_entry = sunk_constant.try_emplace(constant, nullptr); + if (!map_entry.second) { + // This constant has already been cloned into the region, reuse it. + use->set(map_entry.first->getSecond().getResult()); + if (constant.use_empty()) const_op.erase(); + return; + } + if (constant.hasOneUse()) { + const_op.getOperation()->moveBefore(®ion->front().front()); + return; + } + map_entry.first->getSecond() = const_op.clone(); + region->front().getOperations().insert(region->front().begin(), + map_entry.first->getSecond()); + use->set(map_entry.first->getSecond().getResult()); + }); + } +}; + +static mlir::PassRegistration pass( + "xla-hlo-sink-constants-to-control-flow", + "Sink constants implicitly captured in control flow regions. 
This is " + "necessary to export to XLA."); + +} // anonymous namespace + +std::unique_ptr> createSinkConstantsToControlFlowPass() { + return std::make_unique(); +} + +} // namespace xla_hlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc index 8976bd5b7d2..71441656c08 100644 --- a/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc @@ -38,7 +38,8 @@ struct InferReturnTypeComponentsPattern : public RewritePattern { SmallVector components; if (failed(defining_op_int.inferReturnTypeComponents( op->getContext(), op->getLoc(), defining_op->getOperands(), - defining_op->getAttrs(), defining_op->getRegions(), components))) { + defining_op->getAttrDictionary(), defining_op->getRegions(), + components))) { return failure(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 32d8b079c89..eead03404cb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -58,13 +58,9 @@ Value CalculateShapeValue(Location loc, Value operand, int64_t rank = result_type.getRank(); shape_values.reserve(rank); for (int64_t i = 0; i < rank; ++i) { - auto index_value = rewriter.create(loc, operand, i); - shape_values.push_back(rewriter.create( - loc, index_value, rewriter.getIntegerType(32))); + shape_values.push_back(rewriter.create(loc, operand, i)); } - Type shape_element_type = shape_values.front().getType(); - return rewriter.create( - loc, RankedTensorType::get({rank}, shape_element_type), shape_values); + return rewriter.create(loc, shape_values); } Value MaterializeEpsilon(Operation* op, FloatAttr epsilon_attr, @@ -137,8 +133,8 @@ class UnfuseBatchNormInferencePattern if (!epsilon) { return failure(); } - Value stddev = rewriter.create( - bn_op.getLoc(), bn_op.variance(), epsilon, /*broadcast_dims=*/nullptr); + Value stddev = rewriter.create(bn_op.getLoc(), + bn_op.variance(), epsilon); stddev = rewriter.create(bn_op.getLoc(), stddev); // Broadcast all terms. @@ -162,13 +158,13 @@ class UnfuseBatchNormInferencePattern // Compute: // scale * (input - mean) / stddev + offset Value result = rewriter.create( - bn_op.getLoc(), bn_op.operand(), broadcast_mean, nullptr); + bn_op.getLoc(), bn_op.operand(), broadcast_mean); result = rewriter.create(bn_op.getLoc(), result, - broadcast_scale, nullptr); + broadcast_scale); result = rewriter.create(bn_op.getLoc(), result, - broadcast_stddev, nullptr); - rewriter.replaceOpWithNewOp(bn_op, result, broadcast_offset, - nullptr); + broadcast_stddev); + rewriter.replaceOpWithNewOp(bn_op, result, + broadcast_offset); return success(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc index 436a3e701e1..a12bd9e7c1a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc @@ -251,17 +251,15 @@ Value LhloDialectEmitter::GetOrCreateView( // Create the view for this slice size, possible with an affine map to model // the offset. The result is cached in the slices_ map. 
- SmallVector offset_map; - if (slice.offset()) { - offset_map.push_back(AffineMap::get( - /*dimCount=*/1, /*symbolCount=*/0, - {getAffineDimExpr(0, builder_.getContext()) + slice.offset()}, - builder_.getContext())); - } - auto slice_type = MemRefType::get({slice.size()}, i8_type_, offset_map); + // The std.view result type does not carry the static offset: this is not + // useful information. Rather, the view op must have the static offset. + auto slice_type = MemRefType::get({slice.size()}, i8_type_, {}); - auto slice_view = builder_.create( - alloc_buffer.getLoc(), slice_type, alloc_buffer, /*operands=*/llvm::None); + Value byte_shift = + builder_.create(alloc_buffer.getLoc(), slice.offset()); + auto slice_view = + builder_.create(alloc_buffer.getLoc(), slice_type, alloc_buffer, + byte_shift, /*sizes=*/ArrayRef{}); slices_.insert({slice_key, slice_view}); return slice_view; } @@ -277,9 +275,12 @@ StatusOr LhloDialectEmitter::GetOrCreateView( Value slice_view = GetOrCreateView(out_slice); TF_ASSIGN_OR_RETURN(Type out_type, ::xla::ConvertShapeToType( target_shape, builder_)); + Value byte_shift = + builder_.create(builder_.getUnknownLoc(), 0); if (slice_view.getType() != out_type) - slice_view = builder_.create(builder_.getUnknownLoc(), out_type, - slice_view, llvm::None); + slice_view = + builder_.create(builder_.getUnknownLoc(), out_type, slice_view, + byte_shift, /*sizes=*/ArrayRef{}); return slice_view; } diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 9cce6799288..2b496677d62 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -84,7 +84,8 @@ class PointwiseToLinalgConverter : public OpConversionPattern { emitError(loc, "lhlo to linalg conversion expects ranked args"); return failure(); } - if (!argType.getElementType().isSignlessIntOrFloat()) { + auto elemTy = argType.getElementType(); + if (!elemTy.isSignlessIntOrFloat() && !elemTy.template isa()) { return failure(); } @@ -572,6 +573,34 @@ class ConstConverter : public OpConversionPattern { } }; +// TODO(b/156787842): Support the lowering for dynamic shapes. +template +class ReverseConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.reserve(nloops); + for (int i = 0; i < nloops; ++i) + inputExprs.push_back(b->getAffineDimExpr(i)); + for (auto dim : op.dimensions()) { + int i = dim.getZExtValue(); + if (resultType.isDynamicDim(i)) return {}; + int n = resultType.getShape()[i]; + inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i]; + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + class SliceConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -618,17 +647,20 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, // TODO(ataei): Remove this pattern, CopyOp is folded away. 
PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -638,6 +670,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, + ReverseConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -716,16 +749,19 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -735,6 +771,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, ReshapeOpConverter, + ReverseConverter, TransposeConverter>(context); } diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc new file mode 100644 index 00000000000..b2afc7c1026 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc @@ -0,0 +1,122 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +==============================================================================*/ + +#include "absl/memory/memory.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_hlo { +namespace { + +template +inline void AddLegalOpOnRankedTensor(ConversionTarget *conversionTarget) { + conversionTarget->addDynamicallyLegalOp([](OpTy op) { + return op.getOperand().getType().template cast().hasRank(); + }); +} + +template +struct UnaryElementwiseOpConversion : public OpRewritePattern { + explicit UnaryElementwiseOpConversion(MLIRContext *context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + // Don't apply conversion to ops with statically shaped operands. + Value operand = op.getOperand(); + auto operandTy = operand.getType().dyn_cast(); + if (operandTy.hasRank()) return failure(); + + // Generate IR to flatten the operand. + auto loc = op.getLoc(); + Value shape = rewriter.create(loc, operand); + Value numElements = rewriter.create( + loc, rewriter.getType(), shape); + Value numElementsAsIndex = rewriter.create( + loc, rewriter.getIndexType(), numElements); + Value flatShapeAsDimTensor = + rewriter.create(loc, numElementsAsIndex); + auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, + operandTy.getElementType()); + Value flatOperand = rewriter.create( + loc, flatTensorTy, operand, flatShapeAsDimTensor); + + // Generate IR for the actual operation. + Value flatResult = rewriter.create(loc, flatTensorTy, flatOperand); + + // Generate IR to restore the original shape. + auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, + rewriter.getIndexType()); + Value shapeAsExtentTensor = + rewriter.create(loc, extentTensorTy, shape); + Value result = rewriter.create( + loc, operandTy, flatResult, shapeAsExtentTensor); + rewriter.replaceOp(op, result); + + return success(); + } +}; + +struct TransformUnrankedHloPass + : public PassWrapper { + void runOnFunction() override { + ConversionTarget conversionTarget(getContext()); + OwningRewritePatternList conversionPatterns; + SetupTransformUnrankedHloLegality(&getContext(), &conversionTarget); + PopulateTransformUnrankedHloPatterns(&getContext(), &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) + return signalPassFailure(); + } +}; + +} // namespace + +void SetupTransformUnrankedHloLegality(MLIRContext *context, + ConversionTarget *conversionTarget) { + conversionTarget->addLegalDialect(); + + // Targeted operations are only legal when they operate on ranked tensors. 
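The conversion above rank-specializes a unary elementwise op on an unranked tensor: it materializes the operand's shape, flattens the operand to a 1-D tensor whose extent is the total element count, applies the op, and reshapes the result back (the legality setup continues right after this sketch). A plain C++ sketch of that flatten/apply/restore idea, using sqrt as an assumed example of a unary elementwise op, purely illustrative:

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Whatever the (unknown) rank, the buffer can be treated as a 1-D tensor of
    // num_elements(shape) values; the op is applied to that flat view, and the
    // original shape still describes the result.
    std::vector<float> ElementwiseSqrt(const std::vector<float>& flat_data,
                                       const std::vector<int64_t>& shape) {
      int64_t num_elements = std::accumulate(
          shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
      std::vector<float> result(static_cast<std::size_t>(num_elements));
      for (int64_t i = 0; i < num_elements; ++i) {
        auto idx = static_cast<std::size_t>(i);
        result[idx] = std::sqrt(flat_data[idx]);
      }
      return result;  // interpret together with the original `shape`
    }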
+ AddLegalOpOnRankedTensor(conversionTarget); +} + +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert>(context); +} + +std::unique_ptr> createTransformUnrankedHloPass() { + return absl::make_unique(); +} + +static PassRegistration transform_unranked_hlo_pass( + "transform-unranked-hlo", + "Realize element-wise operations on ranked tensors where possible"); + +} // namespace xla_hlo +} // namespace mlir diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index cd22b527444..ea4ba8dab6b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -470,6 +470,7 @@ tf_xla_py_test( name = "concat_ops_test", size = "medium", srcs = ["concat_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "many_xla_args", @@ -561,6 +562,7 @@ tf_xla_py_test( name = "dynamic_slice_ops_test", size = "small", srcs = ["dynamic_slice_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1342,6 +1344,7 @@ tf_xla_py_test( name = "ternary_ops_test", size = "medium", srcs = ["ternary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1384,6 +1387,7 @@ tf_xla_py_test( size = "medium", srcs = ["fused_batchnorm_test.py"], python_version = "PY3", + shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 92ea1cfaf87..eb8883c9ccd 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import itertools -import os import numpy as np @@ -73,8 +72,6 @@ class BinaryOpsTest(xla_test.XLATestCase): self.assertAllCloseAccordingToType( result[i], expected[i], rtol=rtol, atol=atol) - @test_util.disable_mlir_bridge( - "F16 type is not supported in CreateDenseElementsAttrFromLiteral") def testFloatOps(self): for dtype in self.float_types: if dtype == dtypes.bfloat16.as_numpy_dtype: @@ -1020,7 +1017,8 @@ class BinaryOpsTest(xla_test.XLATestCase): math_ops.matmul, np.array([[3.1415926535897932]], dtype=dtype), np.array([[2.7182818284590452]], dtype=dtype), - expected=np.array([[8.5397342226735668]], dtype=dtype)) + expected=np.array([[8.5397342226735668]], dtype=dtype), + rtol=1e-14) # Edge case with a large range of exponent. Not supported by float16. if dtype != np.float16: @@ -1028,7 +1026,8 @@ class BinaryOpsTest(xla_test.XLATestCase): math_ops.matmul, np.array([[9.4039548065783000e-38]], dtype=dtype), np.array([[4.5070591730234615e37]], dtype=dtype), - expected=np.array([[4.2384180773686798]], dtype=dtype)) + expected=np.array([[4.2384180773686798]], dtype=dtype), + rtol=1e-14) # TODO(phawkins): failing on GPU, no registered kernel. 
def DISABLED_testSparseMatMul(self): @@ -1098,8 +1097,6 @@ class BinaryOpsTest(xla_test.XLATestCase): x, expected=np.matmul(x, x.transpose([0, 1, 3, 2]))) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testExpandDims(self): for dtype in self.numeric_types: self._testBinary( @@ -1197,8 +1194,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.full([1, 1, 3, 5], 3., dtype=np.float32), expected=np.full([4, 5, 1, 2, 5], 18., dtype=np.float32)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testPad(self): for dtype, pad_type in itertools.product( self.numeric_types, [np.int32, np.int64]): @@ -1339,8 +1334,6 @@ class BinaryOpsTest(xla_test.XLATestCase): ], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testReshape(self): for dtype in self.numeric_types: self._testBinary( @@ -1473,8 +1466,6 @@ class BinaryOpsTest(xla_test.XLATestCase): [1, 2]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testTranspose(self): for dtype in self.numeric_types: self._testBinary( @@ -1493,8 +1484,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([1, 0], dtype=np.int32), expected=np.array([[1, 3], [2, 4]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testConjugateTranspose(self): for dtype in self.complex_types: self._testBinary( @@ -1513,7 +1502,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([1, 0], dtype=np.int32), expected=np.array([[1 + 1j, 3 + 3j], [2 - 2j, 4 - 4j]], dtype=dtype)) - @test_util.disable_mlir_bridge("Enable tf.Cross Compilation") def testCross(self): for dtype in self.float_types: self._testBinary( @@ -1592,8 +1580,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) - @test_util.disable_mlir_bridge( - "Requires BroadcastInDim method in MlirHloBuilder") def testBroadcastTo(self): for dtype in self.all_types: x = np.random.randint(0, high=100, size=[2, 3]) @@ -1604,29 +1590,16 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=x) self._testBinary( array_ops.broadcast_to, - x, - np.array([6, 6], dtype=np.int32), - expected=np.tile(x, [3, 2])) + np.zeros([2, 3], dtype=dtype), + np.array([2, 2, 3], dtype=np.int32), + expected=np.zeros([2, 2, 3], dtype=dtype)) + + x = np.arange(2).reshape((2, 1)).astype(dtype) self._testBinary( array_ops.broadcast_to, x, - np.array([7, 4, 3], dtype=np.int32), - expected=np.tile(x, [7, 2, 1])) - self._testBinary( - array_ops.broadcast_to, - x, - np.array([7, 0, 3], dtype=np.int32), - expected=np.zeros([7, 0, 3], dtype=dtype)) - self._testBinary( - array_ops.broadcast_to, - x, - np.array([7, 1, 2, 9], dtype=np.int32), - expected=np.tile(x, [7, 1, 1, 3])) - self._testBinary( - array_ops.broadcast_to, - np.zeros([2, 0], dtype=dtype), - np.array([4, 0], dtype=np.int32), - expected=np.zeros([4, 0], dtype=dtype)) + np.array([2, 2, 3], dtype=np.int32), + expected=np.tile(x, (2, 1, 3))) x = np.arange(3).reshape((3, 1, 1, 1)).astype(dtype) self._testBinary( @@ -1637,8 +1610,4 @@ class BinaryOpsTest(xla_test.XLATestCase): if __name__ == "__main__": - # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems - os.environ[ - "XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false " + os.environ.get( - "XLA_FLAGS", "") googletest.main() diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py 
index 10dd2d6542c..f35ded924d5 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gradients_impl @@ -293,6 +294,7 @@ class ConcatTest(xla_test.XLATestCase): # The purpose of this is to ensure that XLA on GPU will not run out of memory # with too many arguments. + @test_util.disable_mlir_bridge("TODO(b/153895138): Debug.") def testConcatLargeNumberOfTensors(self): if "CPU" in self.device: self.skipTest("This test can time out on CPU, so we will just allow " diff --git a/tensorflow/compiler/tests/data_format_ops_test.py b/tensorflow/compiler/tests/data_format_ops_test.py index 681c1f3499e..08d44256b50 100644 --- a/tensorflow/compiler/tests/data_format_ops_test.py +++ b/tensorflow/compiler/tests/data_format_ops_test.py @@ -81,11 +81,21 @@ class XlaPermuteOpTest(xla_test.XLATestCase): x = np.array([7, 4, 9, 3], dtype=dtype) self._runPermuteAndCompare(x, "NHWC", "NCHW", [7, 3, 4, 9]) + def testNHWCToNCHW_Size2(self): + for dtype in {np.int32, np.int64}: + x = np.array([4, 9], dtype=dtype) + self._runPermuteAndCompare(x, "NHWC", "NCHW", [4, 9]) + def testNCHWToNHWC(self): for dtype in {np.int32, np.int64}: x = np.array([7, 4, 9, 3], dtype=dtype) self._runPermuteAndCompare(x, "NCHW", "NHWC", [7, 9, 3, 4]) + def testNCHWToNHWC_Size2(self): + for dtype in {np.int32, np.int64}: + x = np.array([9, 3], dtype=dtype) + self._runPermuteAndCompare(x, "NCHW", "NHWC", [9, 3]) + def testNHWCToHWNC(self): for dtype in {np.int32, np.int64}: x = np.array([7, 4, 9, 3], dtype=dtype) diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py index 6a9076e9be8..a36effe5984 100644 --- a/tensorflow/compiler/tests/fused_batchnorm_test.py +++ b/tensorflow/compiler/tests/fused_batchnorm_test.py @@ -23,7 +23,6 @@ import numpy as np from tensorflow.compiler.tests import test_utils from tensorflow.compiler.tests import xla_test -from tensorflow.python.compat import compat from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gradient_checker @@ -132,9 +131,6 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase): def _testLearning(self, use_gradient_checker, data_format, exponential_avg_factor): - if not compat.forward_compatible(2020, 3, - 6) and exponential_avg_factor != 1.0: - self.skipTest("running average not available.") channel = 3 x_shape = [2, 2, 6, channel] scale_shape = [channel] diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index a2a47f19a6e..7bbfecff403 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -24,6 +24,7 @@ import scipy.special as sps from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops @@ -47,6 +48,8 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): {'start': 1, 'end': 2, 'num': 1}, 
{'start': 1, 'end': 4, 'num': 3}, {'start': 0, 'end': 41, 'num': 42}) + @test_util.disable_mlir_bridge( + 'TODO(b/156174708): Dynamic result types not supported') def testLinspace(self, start, end, num): expected = np.linspace(start, end, num, dtype=np.float32) result = self._testTernary( @@ -211,6 +214,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): upper, expected=np.minimum(np.maximum(x, lower), upper)) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetaincSanity(self): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: @@ -248,6 +252,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): 'atol': 2e-4 }, ) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetainc(self, sigma, rtol, atol): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 0c4c7bacdf3..85bf89c4f9e 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -186,8 +186,6 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( math_ops.cos, x, expected=np.cos(x), rtol=tol, atol=1e-5) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.Softmax compilation") def testFloatOps(self): for dtype in self.float_types: x = np.arange(-0.90, 0.90, 0.25) @@ -349,17 +347,15 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array( [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype)) - # TODO(b/130689556): Turn this on for CPU when we start honoring NaNs. 
- if self.device != "XLA_CPU": - self._assertOpOutputMatchesExpected( - math_ops.tanh, - np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], - [19, -19, 22, -22]], - dtype=dtype), - expected=np.array( - [[0.76159418, 0.96402758, 0.99505478, 0.99932933], - [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], - dtype=dtype)) + self._assertOpOutputMatchesExpected( + math_ops.tanh, + np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], + [19, -19, 22, -22]], + dtype=dtype), + expected=np.array( + [[0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], + dtype=dtype)) self._assertOpOutputMatchesExpected( nn_ops.log_softmax, @@ -514,6 +510,11 @@ class UnaryOpsTest(xla_test.XLATestCase): ], dtype=dtype)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.QuantizeAndDequantize compilation") + def testQuantizeAndDequantize(self): + for dtype in self.float_types: + def quantize_and_dequantize_v2(x): return array_ops.quantize_and_dequantize_v2( x, -127, 127, signed_input=True, num_bits=8) @@ -598,8 +599,6 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-1, -0.5, 0, 0.3], dtype=dtype), expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) - @test_util.disable_mlir_bridge( - "Complex types not supported in CreateDenseElementsAttrFromLiteral") def testComplexOps(self): for dtype in self.complex_types: diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 3b304df9024..f3e915daa67 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -51,7 +51,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): equality_fn = self.assertAllClose equality_fn(result, expected, rtol=1e-3) - @test_util.disable_mlir_bridge('Not supported yet') def testAdd(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( @@ -72,7 +71,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): np.array([7, 11], dtype=dtype)), expected=np.array([[8, 13], [10, 15]], dtype=dtype)) - @test_util.disable_mlir_bridge('Not supported yet') def testBroadcast(self): for dtype in self.numeric_types: v = np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]) @@ -110,7 +108,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): xla_data_pb2.PrecisionConfig.HIGHEST) @parameterized.parameters(*PRECISION_VALUES) - @test_util.disable_mlir_bridge('Not supported yet') def testConv(self, precision): for dtype in set(self.float_types).intersection( set([dtypes.bfloat16.as_numpy_dtype, np.float32])): @@ -195,8 +192,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.array([1, 2, 3], dtype=dtype),), expected=np.array([-1, -2, -3], dtype=dtype)) - @test_util.disable_mlir_bridge( - 'Requires XlaPad op shape inference to have static result types') def testPad(self): for dtype in self.numeric_types: @@ -309,7 +304,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): self._assertOpOutputMatchesExpected( lambda x: xla.transpose(x, [1, 0]), args=(v,), expected=v.T) - @test_util.disable_mlir_bridge('Not supported yet') def testDynamicSlice(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( @@ -322,7 +316,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [[673, 674], [683, 684], [693, 694]]]), dtype=dtype)) - @test_util.disable_mlir_bridge('Not supported yet') + 
@test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectStartIndicesShape(self): with self.session() as session: with self.test_scope(): @@ -336,7 +330,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): (r'start_indices must be a vector with length equal to input rank, ' r'but input rank is 3 and start_indices has shape \[2\].*')) - @test_util.disable_mlir_bridge('Not supported yet') + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectSizeIndicesShape(self): with self.session() as session: with self.test_scope(): diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 356798c19bd..3d3eab51268 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -403,6 +403,7 @@ tf_cuda_library( "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/optimizers:meta_optimizer", "//tensorflow/stream_executor/lib", + "//tensorflow/tools/graph_transforms:transform_utils", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 806d930b76f..414d27477bc 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -41,18 +41,16 @@ limitations under the License. #include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/devices.h" -#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT #include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/tools/graph_transforms/transform_utils.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -90,8 +88,6 @@ bool AllowDynamicNonBatchDimension(const ConversionParams& params) { GetEngineType(params) == EngineInfo::EngineType::TRTDynamic; } -} // namespace - struct EdgePtrCompare { bool operator()(const Edge* lhs, const Edge* rhs) const { return lhs->id() < rhs->id(); @@ -555,6 +551,58 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } +int64 GetNextGraphSequenceNumber() { + static std::atomic graph_sequence_num; + return graph_sequence_num++; +} + +constexpr char kCastInputTypeAttrName[] = "SrcT"; + +// Transforms node = cast(x, fp32) where datatype(x) != fp16 to: +// castToFp16 = cast(x, fp16) +// node = cast(castToFp16, fp32) +// +Status MaybeRewriteCastToFp32(GraphDef* graph_def, NodeDef* node_def) { + if (node_def->op() != "Cast") { + return Status::OK(); + } + + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR( + graph_transforms::GetInOutTypes(*node_def, &input_types, &output_types)); + + if (input_types.size() != 1 || output_types.size() != 1) { + return errors::Internal("Bad cast operation"); + } + + if (input_types[0] == DT_HALF || output_types[0] != 
DT_FLOAT) { + return Status::OK(); + } + + VLOG(2) << "Rewriting cast to FP32 " << node_def->DebugString(); + + NodeDef* castToFp16 = graph_def->add_node(); + for (auto attr_value : node_def->attr()) { + (*castToFp16->mutable_attr())[attr_value.first] = attr_value.second; + } + castToFp16->set_name(node_def->name() + "_split"); + castToFp16->set_op("Cast"); + castToFp16->set_device(node_def->device()); + castToFp16->add_input(node_def->input(0)); + (*castToFp16->mutable_attr())[kCastOutputTypeAttrName].set_type(DT_HALF); + + node_def->set_input(0, castToFp16->name() + ":0"); + (*node_def->mutable_attr())[kCastInputTypeAttrName].set_type(DT_HALF); + + VLOG(2) << castToFp16->DebugString(); + VLOG(2) << node_def->DebugString(); + + return Status::OK(); +} + +} // namespace + Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name) { Graph segment_graph(graph->flib_def()); @@ -629,11 +677,6 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, return std::make_pair(cuda_device_id, dev_allocator); } -int64 GetNextGraphSequenceNumber() { - static std::atomic graph_sequence_num; - return graph_sequence_num++; -} - // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. @@ -643,12 +686,43 @@ Status ConvertAfterShapes(const ConversionParams& params) { "Calibration with FP32 or FP16 is not supported."); } + // Make a copy of the input_graph_def because grappler doesn't allow changes + // to the input_graph_def and GraphProperties only accepts GraphDef, but not + // Graph, as inputs. + // + // If the overhead of copying the input_graph_def becomes a concern, we can + // avoid the copy by (1) enhancing the GraphPropertiers representation to + // allow adding shape properties for newly created graph nodes and (2) rewrite + // the GraphDef transformation to Graph transformation. + GraphDef modified_graph_def = params.grappler_item->graph; + // When precision_mode is FP16, transform cast(x, fp32) to + // cast(cast(x, fp16), fp32). This creates cast(fp16, f32) that can be + // included in the TRTEngineOp as an TensorRT Identity layer for performance: + // . Avoid cast(fp32, fp16) in the TRT engine implementation for fp16 + // precision. + // . Changing the input to the TRTEngine from fp32 to fp16 may reduce data + // moving from the host to the GPU. + if (params.precision_mode == TrtPrecisionMode::FP16) { + for (int i = 0; i < modified_graph_def.node_size(); i++) { + NodeDef* node_def = modified_graph_def.mutable_node(i); + TF_RETURN_IF_ERROR(MaybeRewriteCastToFp32(&modified_graph_def, node_def)); + } + } + + // Construct a GrapplerItem using the modified graph_def and the input + // grappler_item. + grappler::GrapplerItem grappler_item = + params.grappler_item->WithGraph(std::move(modified_graph_def)); + const GraphDef& graph_def = grappler_item.graph; + + grappler::GraphProperties static_graph_properties(grappler_item); + TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); + // Convert graphdef to graph. 
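A rough Python sketch (illustration only) that mirrors the Cast-splitting rewrite MaybeRewriteCastToFp32 performs above: a node Cast(x, fp32) whose source type is not fp16 becomes Cast(Cast(x, fp16), fp32). The helper name maybe_rewrite_cast_to_fp32 and the toy GraphDef are hypothetical; only the attribute names SrcT/DstT and proto APIs are standard TensorFlow.

from tensorflow.core.framework import graph_pb2, types_pb2

def maybe_rewrite_cast_to_fp32(graph_def, node):
  """Splits node = Cast(x, fp32) with a non-fp16 source into Cast(Cast(x, fp16), fp32)."""
  if node.op != "Cast":
    return
  if (node.attr["SrcT"].type == types_pb2.DT_HALF or
      node.attr["DstT"].type != types_pb2.DT_FLOAT):
    return
  cast_to_fp16 = graph_def.node.add()
  cast_to_fp16.CopyFrom(node)                 # copy op, device and attrs
  cast_to_fp16.name = node.name + "_split"
  cast_to_fp16.ClearField("input")
  cast_to_fp16.input.append(node.input[0])
  cast_to_fp16.attr["DstT"].type = types_pb2.DT_HALF
  # Re-point the original Cast at the fp16 intermediate.
  node.input[0] = cast_to_fp16.name + ":0"
  node.attr["SrcT"].type = types_pb2.DT_HALF

g = graph_pb2.GraphDef()
cast = g.node.add(name="cast", op="Cast", input=["x"])
cast.attr["SrcT"].type = types_pb2.DT_INT32
cast.attr["DstT"].type = types_pb2.DT_FLOAT
maybe_rewrite_cast_to_fp32(g, cast)
assert cast.input[0] == "cast_split:0" and cast.attr["SrcT"].type == types_pb2.DT_HALF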
- FunctionLibraryDefinition flib(OpRegistry::Global(), - params.input_graph_def->library()); + FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); Graph graph(flib); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - *params.input_graph_def, &graph)); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph)); // Segment the graph into subgraphs that can be converted to TensorRT segment::SegmentOptions segment_options; @@ -662,10 +736,10 @@ Status ConvertAfterShapes(const ConversionParams& params) { AllowDynamicNonBatchDimension(params); segment::SegmentNodesVector initial_segments; - TrtNodeValidator validator(*params.graph_properties, params.precision_mode, + TrtNodeValidator validator(static_graph_properties, params.precision_mode, params.use_calibration, params.use_implicit_batch); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, params.graph_properties, + &graph, &static_graph_properties, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't @@ -693,9 +767,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { auto& curr_segment = initial_segments.at(t); EngineInfo curr_engine; curr_engine.engine_name = StrCat(engine_name_prefix, t); - Status status = - GetEngineInfo(&graph, *params.graph_properties, curr_segment, node_map, - reverse_topo_order, &curr_engine); + Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment, + node_map, reverse_topo_order, &curr_engine); if (!status.ok()) { LOG(WARNING) << "Failed to get engine info for segment " << t << ": " << status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 2bfaa2a786c..53ab84a6fa9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,10 +18,9 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" -#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -33,7 +32,7 @@ namespace tensorrt { namespace convert { struct ConversionParams { - const GraphDef* input_graph_def = nullptr; + const grappler::GrapplerItem* grappler_item = nullptr; const std::vector* output_names = nullptr; string trt_logger_name; size_t max_batch_size = 1; @@ -41,7 +40,6 @@ struct ConversionParams { GraphDef* output_graph_def = nullptr; TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; int minimum_segment_size = 3; - const grappler::GraphProperties* graph_properties = nullptr; const grappler::Cluster* cluster = nullptr; // Whether to create engine on conversion or execution time bool is_dyn_op = false; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 2cfefd27a67..a1f523d6bfa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -162,12 +162,11 @@ class ConvertAfterShapesTest : public ::testing::Test { // Construct ConversionParams. 
const std::vector output_names{"output"}; ConversionParams params; - params.input_graph_def = &item.graph; params.output_names = &output_names; params.max_workspace_size_bytes = 8 << 20; params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; - params.graph_properties = &graph_properties; + params.grappler_item = &item; params.use_calibration = false; params.trt_logger_name = "DefaultLogger"; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 4aec6a2512c..8ca7c4cdf8f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -403,6 +404,18 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, // Compare broadcast feasibility if (check_feasibility) { for (int i = 0; i < broadcast_num_dims; ++i) { + if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) { + // If the condition is true then we are in explicit batch mode and (at + // least) one of the input dimensions are unknown. In other words we + // are in dynamic shape mode. During conversion time we only see -1 for + // the unknown shapes, therefore we cannot decide on the feasibility of + // broadcast over the unknown dimensions. Therefore we just continue for + // the next dimension. In dynamic shape mode TRT can only check the + // feasibility of the broadcast when the actual input dimensions are + // specified by SetTrtEngineInputs and the inference job is launched by + // TrtEnque. + continue; + } if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { return errors::InvalidArgument("Infeasible broadcast scheme (", @@ -795,6 +808,19 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { } } +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (is_tensor()) { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + + if (is_weights()) { + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + } + return errors::Internal("The object is probably not initialized"); +} + string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { @@ -1900,27 +1926,48 @@ Status CheckInputsWeights( return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const char* type_attr_name) { TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, + if (!attrs.count(type_attr_name)) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, " not found."); } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
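A small sketch (illustration only, hypothetical helper name broadcast_feasible) of the feasibility rule extended in GetTrtBroadcastShape above: in explicit-batch mode a -1 (unknown) dimension cannot be validated at conversion time, so the check is skipped and deferred to TensorRT at runtime.

def broadcast_feasible(output_l, output_r, use_implicit_batch):
  for l, r in zip(output_l, output_r):
    if not use_implicit_batch and (l == -1 or r == -1):
      continue                      # unknown dim: defer the check to runtime
    if l != r and l != 1 and r != 1:
      return False
  return True

assert broadcast_feasible([2, -1, 3], [2, 5, 1], use_implicit_batch=False)
assert not broadcast_feasible([2, 4, 3], [2, 5, 1], use_implicit_batch=True)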
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); - } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), + *tf_type = attrs.get(type_attr_name); + return Status::OK(); +} + +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); + } + + return inputs[pos].GetTfType(tf_type); +} + +constexpr const char kOutputTypeAttrName[] = "T"; + +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = kOutputTypeAttrName) { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + string allowed_types_string = absl::StrJoin( + allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + return errors::Unimplemented("Data type ", DataTypeString(tf_type), " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + ", must be one of [", allowed_types_string, + "], at ", node_def.name()); } return Status::OK(); } @@ -2111,6 +2158,12 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, "Stride must be 1 for batch and channel dimensions, at ", node_def.name()); } + // Channel dim must be static for DepthwiseConv2dNative since we use that + // value for num_groups at build time. + if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) { + return errors::InvalidArgument("Channel dimension must be static, at ", + node_def.name()); + } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); if (params->validation_only) return Status::OK(); @@ -2122,11 +2175,12 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); + const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1]; // group == 0 signifies that this is a depthwise convolution, so set // num_groups to size of input's channel dim. For a non-depthwise conv, // num_groups will be 1. - const int num_groups = (group == 0) ? tensor_dim.d[0] : group; + const int num_groups = (group == 0) ? c_dim_size : group; // For conv, TF weights are RSCK, and TRT expects KCRS. // For backprop, TF weights are RSKC, and TRT expects CKRS. @@ -2413,26 +2467,19 @@ Status ConvertExpandDims(OpConverterParams* params) { } Status Converter::SqueezeTensor(nvinfer1::ITensor* input, - const std::vector& trt_axes, + std::vector* input_dims, nvinfer1::ITensor** output) { - const nvinfer1::Dims dims = input->getDimensions(); - std::vector input_dims(dims.d, dims.d + dims.nbDims); - // Mark axes to remove by setting them to 0. - for (int axis : trt_axes) { - input_dims[axis] = 0; - } - #if IS_TRT_VERSION_GE(6, 0, 0, 0) // If the remaining dimensions of a squeeze operation have dynamic sizes, we // need to use TRT ops to build the result shape for the squeeze operation. // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special // value. 
- if (absl::c_any_of(input_dims, [](int i) { return i == -1; })) { + if (absl::c_any_of(*input_dims, [](int i) { return i == -1; })) { nvinfer1::ITensor* shape = network()->addShape(*input)->getOutput(0); std::vector concat_inputs; - for (int i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims->size(); i++) { // If input dim wasn't set to 0 earlier, we include it in new shape. - if (input_dims[i] != 0) { + if (input_dims->at(i) != 0) { concat_inputs.push_back( network() ->addSlice(*shape, {1, {i}}, {1, {1}}, {1, {1}}) @@ -2452,11 +2499,12 @@ Status Converter::SqueezeTensor(nvinfer1::ITensor* input, } #endif // Remove all dims which are equal to 0. - input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), - input_dims.end()); + input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0), + input_dims->end()); // Reshape tensor. nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); + VLOG(2) << "input_dims" << input_dims; + TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(*input_dims, &new_dims)); TF_RETURN_IF_ERROR(PrepareTensorForShape(TRT_TensorOrWeights(input), new_dims, /*validation_only=*/false, output)); return Status::OK(); @@ -2475,31 +2523,48 @@ Status ConvertSqueeze(OpConverterParams* params) { TFAttrs attrs(node_def); auto squeeze_dims = attrs.get>("squeeze_dims"); if (squeeze_dims.empty()) { - return errors::Unimplemented( - "Squeeze is only implemented for explicit dims, at ", node_def.name()); - } - std::vector trt_axes; - trt_axes.reserve(squeeze_dims.size()); - for (int tf_axis : squeeze_dims) { - // If the axis is valid, then convert it to TRT axis, otherwise abort - // conversion. - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - params->use_implicit_batch, &trt_axis)); - // Make sure target dimension is size 1 or unknown size (-1) - if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { - return errors::InvalidArgument( - "Dimension ", tf_axis, " with size ", input_dims[trt_axis], - " cannot be squeezed because it must be size 1, at ", + if (params->use_implicit_batch || !HasStaticShape(dims)) { + return errors::Unimplemented( + "Squeeze is not implemented for empty squeeze_dims, at ", node_def.name()); + } else { + // explicit batch mode with static input shape we squeeze all singleton + // dimensions + for (int& dim : input_dims) { + if (dim == 1) { + // Mark it for removal by setting it to 0 + dim = 0; + } + } + } + } else { + std::vector trt_axes; + trt_axes.reserve(squeeze_dims.size()); + for (int tf_axis : squeeze_dims) { + // If the axis is valid, then convert it to TRT axis, otherwise abort + // conversion. + int trt_axis; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), + params->use_implicit_batch, &trt_axis)); + // Make sure target dimension is size 1 or unknown size (-1) + if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { + return errors::InvalidArgument( + "Dimension ", tf_axis, " with size ", input_dims[trt_axis], + " cannot be squeezed because it must be size 1, at ", + node_def.name()); + } + trt_axes.push_back(trt_axis); + } + // Mark axes to remove by setting them to 0. 
+ for (int axis : trt_axes) { + input_dims[axis] = 0; } - trt_axes.push_back(trt_axis); } if (params->validation_only) return Status::OK(); nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( - input_tensor.tensor(), trt_axes, &output_tensor)); + input_tensor.tensor(), &input_dims, &output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } @@ -3842,11 +3907,26 @@ Status ConvertBiasAdd(OpConverterParams* params) { nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims(); nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims(); - // If the input is NCHW, then we need to unsqueeze the bias such that its last - // dimensions are 1s (and the first dimension is C). + // The bias input arg is a 1-D tensor with length C. If the input is NCHW, + // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1]. if (data_format == "NCHW") { - bias_shape.nbDims = input_shape.nbDims; - std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + if (params->use_implicit_batch) { + // The batch dim is not included in implicit batch mode, so the shape of + // the bias tensor is [C, 1, 1]. + bias_shape.nbDims = input_shape.nbDims; + std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + } else { + // In explicit batch mode we create a tensor with shape [1, C, 1, 1]. + std::vector bias_shape_vec(bias_shape.d, + bias_shape.d + bias_shape.nbDims); + // Insert 1 before for batch dim + bias_shape_vec.insert(bias_shape_vec.begin(), 1); + // Trail with 1s to match input_shape size + bias_shape_vec.insert(bias_shape_vec.end(), + input_shape.nbDims - bias_shape_vec.size(), 1); + TF_RETURN_IF_ERROR( + TensorShapeArrayToTrtDims(bias_shape_vec, &bias_shape)); + } } else { // Next, broadcast the bias across the input. TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1), @@ -4587,6 +4667,44 @@ Status ConvertUnpack(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } +// Supports cast fp16=>fp32 through IIdentityLayer. 
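A minimal Python sketch (illustration only, hypothetical helper name squeeze_dims) of the dim-marking convention used by the updated SqueezeTensor/ConvertSqueeze above: axes to drop are first set to 0 in the dimension vector, and with empty squeeze_dims in explicit batch mode every static size-1 dimension is marked; the 0 entries are then removed to form the reshape target.

def squeeze_dims(input_dims, axes=None):
  dims = list(input_dims)
  if axes is None:                      # empty squeeze_dims: drop all static 1s
    dims = [0 if d == 1 else d for d in dims]
  else:
    for a in axes:
      assert dims[a] in (1, -1), "only size-1 or unknown dims can be squeezed"
      dims[a] = 0                       # mark for removal
  return [d for d in dims if d != 0]

assert squeeze_dims([2, 1, 3, 1]) == [2, 3]
assert squeeze_dims([2, 1, 3], axes=[1]) == [2, 3]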
+Status ConvertCast(OpConverterParams* params) { + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); + auto unsupport_cast_error = [&]() { + return errors::Unimplemented("Cast op: ", node_def.op(), + " not supported at: ", node_def.name()); + }; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + if (input_type != DataType::DT_HALF) { + return unsupport_cast_error(); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type, + kCastOutputTypeAttrName)); + + if (output_type != DataType::DT_FLOAT) { + return unsupport_cast_error(); + } + + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* input = params->inputs.at(0).tensor(); + nvinfer1::IIdentityLayer* layer = + params->converter->network()->addIdentity(*input); + layer->setPrecision(nvinfer1::DataType::kFLOAT); + + if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { + return errors::Internal("IIdentityLayer doesn't work as expected"); + } + + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + Status ConvertConcat(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -5664,6 +5782,7 @@ static void RegisterValidatableOpConverters( (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; #endif (*registration)["AddN"] = ConvertAddN; + (*registration)["Cast"] = ConvertCast; (*registration)["ConcatV2"] = ConvertConcat; (*registration)["Const"] = ConvertConst; (*registration)["Conv2D"] = ConvertConv2D; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 8608c8226ee..2fe8eec9675 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -294,6 +294,8 @@ class TRT_TensorOrWeights { nvinfer1::Dims GetTrtDims() const; + Status GetTfType(DataType* tf_type) const; + int batch_size() const { return batch_size_; } string DebugString() const; @@ -529,11 +531,9 @@ class Converter { // Helper function to add a squeeze op to the network. // - // The trt_axes argument lists those axes that need to be squeezed. Each axis - // in the list is numbered according to TRT convention (see ConvertAxis for - // details). - Status SqueezeTensor(nvinfer1::ITensor* input, - const std::vector& trt_axes, + // The input_dims argument stores the TRT dimensions of the input tensor, + // where the dimensions to be squeezed are replaced by 0. + Status SqueezeTensor(nvinfer1::ITensor* input, std::vector* input_dims, nvinfer1::ITensor** output); // Creates an IConstantLayer using 'weights' whose dimensions are specified by diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 1f30b837450..f9b0cafe253 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include +#include #include #include #include @@ -67,7 +68,6 @@ using absl::StrCat; using ::testing::ElementsAre; using ::testing::ElementsAreArray; using ::testing::Matcher; -using ::testing::NanSensitiveFloatNear; // TensorRT modes for testing. We define the following three modes: // 1. 
Implicit batch mode: The tensors have static (known) input shape and the @@ -135,30 +135,18 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) { return os; } -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_type) { + nvinfer1::DataType trt_type; + Status status = TfTypeToTrtType(tf_type, &trt_type); + EXPECT_EQ(status, Status::OK()); + return trt_type; } -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; - default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } +DataType TrtDataTypeToTf(nvinfer1::DataType trt_type) { + DataType tf_type; + Status status = TrtTypeToTfType(trt_type, &tf_type); + EXPECT_EQ(status, Status::OK()); + return tf_type; } NodeDef MakeNodeDef(const string& name, const string& op, @@ -216,6 +204,24 @@ void ExpectTrtDimsEqualsArray(const std::vector& lhs, << " actual: " << DebugString(rhs); } +Matcher> ArrayFloatNear(const std::vector& values, + float max_abs_error = 1e-5, + bool nan_sensitive = false) { + std::vector> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + if (nan_sensitive) { + matchers.emplace_back(::testing::NanSensitiveFloatNear(v, max_abs_error)); + } else if (max_abs_error == 0) { + matchers.emplace_back(::testing::FloatEq(v)); + } else { + EXPECT_GE(max_abs_error, 0); + matchers.emplace_back(::testing::FloatNear(v, max_abs_error)); + } + } + return ElementsAreArray(matchers); +} + template void ExpectArrayNear(const std::vector& lhs, absl::Span rhs) { ASSERT_EQ(lhs.size(), rhs.size()); @@ -293,7 +299,8 @@ struct StaticCaster { }; template -std::vector CastTestVector(const std::vector& vals) { +std::vector CastTestVector( + const gtl::ArraySlice& vals) { // non-absl ok std::vector res(vals.size()); std::transform(vals.begin(), vals.end(), res.begin(), StaticCaster()); @@ -1283,6 +1290,21 @@ inline absl::Span GetSpanForData(const InputOutputData& data) { return absl::Span(tensor_map.data(), tensor_map.size()); } +std::vector GetDataAsFloat(InputOutputData& data) { + if (data.tensor.dtype() == DT_FLOAT) { + auto span = GetSpanForData(data); + return std::vector(span.begin(), span.end()); + } + if (data.tensor.dtype() == DT_HALF) { + return CastTestVector( + GetSpanForData(data)); + } + if (data.tensor.dtype() == DT_INT32) { + return CastTestVector(GetSpanForData(data)); + } + LOG(FATAL) << "DataType not supported for testing " + << DataTypeString(data.tensor.dtype()); +} // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { @@ -1336,6 +1358,33 @@ class OpConverterTest : public ::testing::Test { return ret; } + // Constructs a tensor with given values (vals). The tensor type is defined by + // the tf_dtype argument, its shape is given by input_dims. The tensor is + // constructed using the allocator of OpConverterTest in Unified Memory. 
+ template + Tensor AsTensor(std::vector vals, const std::vector input_dims, + DataType tf_dtype) { + Tensor ret(allocator_.get(), tf_dtype, {static_cast(vals.size())}); + if (tf_dtype == DT_FLOAT) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), ret.flat().data()); + } else if (tf_dtype == DT_HALF) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), + ret.flat().data()); + } else if (tf_dtype == DT_INT32) { + auto conv_vals = CastTestVector(vals); + std::copy_n(conv_vals.data(), conv_vals.size(), ret.flat().data()); + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_dtype); + } + TensorShape shape; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(input_dims, &shape)); + CHECK(ret.CopyFrom(ret, shape)); + return ret; + } + // Constructs a flat tensor in Unified Memory. template Tensor ConstructTensor(int data_size, const T& value = T()) { @@ -1343,6 +1392,13 @@ class OpConverterTest : public ::testing::Test { return AsTensor(values); } + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value, DataType tf_dtype) { + std::vector values(data_size, value); + return AsTensor(values, {data_size}, tf_dtype); + } + void CheckDataTypeMatches(const DataVec& datas) { for (const auto& data : datas) { const int input_index = engine_->getBindingIndex(data.name.c_str()); @@ -1356,27 +1412,29 @@ class OpConverterTest : public ::testing::Test { } } - void BuildAndRun(const DataVec& input_data, DataVec* output_data, - const int batch_size = 1) { + Status BuildAndRun(const DataVec& input_data, DataVec* output_data, + const int batch_size = 1) { // Mark the output tensor as TRT engine output. std::vector output_info; for (const auto& data : *output_data) { output_info.push_back( {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())}); } - TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info)); + TF_RETURN_IF_ERROR(converter_->RenameAndMarkOutputTensors(output_info)); // Build the TRT engine. - ASSERT_EQ(nullptr, engine_.get()); + if (engine_.get() != nullptr) { + return errors::Internal("Engine already exists"); + } TrtShapeOptimizationProfile profiles; if (!converter_->use_implicit_batch()) { // Create a single optimization profile for explicit batch mode std::vector input_shapes; - TF_ASSERT_OK(GetShapeFromDataVec(input_data, &input_shapes)); + TF_RETURN_IF_ERROR(GetShapeFromDataVec(input_data, &input_shapes)); profiles.AddShape(input_shapes); profiles.InitProfiles(); } - TF_ASSERT_OK( + TF_RETURN_IF_ERROR( converter_->BuildCudaEngine(&engine_, /*max_batch_size=*/batch_size, /*max_workspace_size_bytes=*/1 << 26, @@ -1390,7 +1448,9 @@ class OpConverterTest : public ::testing::Test { const int num_bindings = input_data.size() + output_data->size(); std::vector buffers(num_bindings); - ASSERT_EQ(engine_->getNbBindings(), num_bindings); + if (engine_->getNbBindings() != num_bindings) { + return errors::Internal("Number of bindings do not match"); + } // Since we have only 1 optimization profile (which is enabled by default) // it is fine to create execution context directly, instead of calling // profiles.CreateExecutionContexts() @@ -1398,19 +1458,19 @@ class OpConverterTest : public ::testing::Test { engine_->createExecutionContext()); // Prepare input bindings. 
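A rough NumPy sketch (illustration only, hypothetical helper names as_test_tensor and data_as_float) of the dtype plumbing added above: test inputs are materialized in the parameterized tf_dtype (float32, float16 or int32), and outputs are converted back to float32 so a single matcher can be reused across type parameterizations.

import numpy as np

_SUPPORTED = {"float32": np.float32, "float16": np.float16, "int32": np.int32}

def as_test_tensor(values, shape, dtype_name):
  dtype = _SUPPORTED[dtype_name]          # unsupported dtypes would be an error
  return np.asarray(values, dtype=dtype).reshape(shape)

def data_as_float(tensor):
  return tensor.astype(np.float32).ravel().tolist()

out = as_test_tensor([1, 2, 3, 4, 5, 6], (1, 2, 3), "float16")
assert data_as_float(out) == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]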
- TF_ASSERT_OK(SetTrtEngineInputs(engine_.get(), execution_context.get(), 0, - buffers, converter_->use_implicit_batch(), - batch_size, nullptr, &input_data)); - + TF_RETURN_IF_ERROR(SetTrtEngineInputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, &input_data)); // Prepare output bindings. - TF_ASSERT_OK(SetTrtEngineOutputs(engine_.get(), execution_context.get(), 0, - buffers, converter_->use_implicit_batch(), - batch_size, nullptr, output_data)); - + TF_RETURN_IF_ERROR(SetTrtEngineOutputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, output_data)); // Execute the TRT engine. - TF_ASSERT_OK(TrtEnqueue(execution_context.get(), buffers, stream_, - converter_->use_implicit_batch(), batch_size)); + TF_RETURN_IF_ERROR(TrtEnqueue(execution_context.get(), buffers, stream_, + converter_->use_implicit_batch(), + batch_size)); cudaStreamSynchronize(stream_); + return Status::OK(); } bool HasStaticShape(const nvinfer1::Dims& dims) const { @@ -1427,7 +1487,7 @@ class OpConverterTest : public ::testing::Test { // Adds ITensor for both validation and conversion, assuming explicit batch // dimension is included in dims (ie for an NCHW tensor dims = {N, C, H, W}). - void AddTestTensorWithExplicitBatchDim( + void AddTestTensorWithTFDims( const string& name, const std::vector& dims, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { DataType tf_dtype = TrtDataTypeToTf(trt_dtype); @@ -1447,54 +1507,19 @@ class OpConverterTest : public ::testing::Test { } } - // Adds ITensor for both validation and conversion. The tensor can have - // partial input shape. This function defines static or dynamic shape input - // tensor for the network based on the trt_mode attribute. This is done - // automatically, unless the user overrides it with an explicit - // partial_input_shape_dims argument. - // - // Parameters: - // - dims actual dimensions of the tensor that we will use during the test - // (including explicit batch dim). This is not used if partial_input_shape - // is defined. - // - partial_input_shape dimensions which can incude unknown shapes. This can - // be empty, in that case the partial_input_shape will be set automatically - // depending on the trt_mode argument. (This also includse explicit batch - // dim). - // - // On return skip_test is false if trt_mode is not compatible with the - // partial input shape. - void AddTestTensor( - const string& name, const std::vector& dims, - nvinfer1::DataType trt_dtype, TrtTestMode trt_mode, - const std::vector* partial_input_shape_dims = nullptr) { - std::vector partial_shape; - if (partial_input_shape_dims && !partial_input_shape_dims->empty()) { - partial_shape = *partial_input_shape_dims; - } else { - if (trt_mode == TrtTestMode::kDynamicShape) { - // In dynamic shape mode we set the all dims unknown. - partial_shape = std::vector(dims.size(), -1); - } else { - // Use static (known) input shapes. - partial_shape = dims; - } - } - AddTestTensorWithExplicitBatchDim(name, partial_shape, trt_dtype); - } - // Adds ITensor for both validation and conversion. The difference compared to - // AddTestTensorWithExplicitBatchDim is in the meaning of the dims parameter. - // To define a tensor with NCHW shape, here we set dims = {C,H,W} and - // batch_size = N. TODO(tfeher) remove this function once all test are updated - // to use the other version of AddTestTensor which has the trt_mode arg. 
+ // AddTestTensorWithTFDims is in the meaning of the dims parameter. To define + // a tensor with NCHW shape, here we set dims = {C,H,W} and batch_size = N. + // TODO(tfeher) remove this function once all test are updated to use the + // other version of AddTestTensor (defined by + // ParameterizedOpConverterTestBase). void AddTestTensor( const string& name, const std::vector& dims, int batch_size = 1, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { std::vector dims_with_batch(dims.size() + 1); dims_with_batch[0] = batch_size; std::copy(dims.begin(), dims.end(), dims_with_batch.begin() + 1); - AddTestTensorWithExplicitBatchDim(name, dims_with_batch, trt_dtype); + AddTestTensorWithTFDims(name, dims_with_batch, trt_dtype); if (HasStaticShape(dims)) { ASSERT_EQ(batch_size, converter_->batch_size_); } @@ -1527,6 +1552,21 @@ class OpConverterTest : public ::testing::Test { converter_->AddTensorOrWeights(name, TRT_TensorOrWeights{weights})); } + template + void AddTestWeights(const string& name, const std::vector& dims, + const std::vector& values, DataType tf_dtype) { + if (tf_dtype == DT_FLOAT) { + AddTestWeights(name, dims, CastTestVector(values)); + } else if (tf_dtype == DT_HALF) { + AddTestWeights(name, dims, CastTestVector(values)); + } else if (tf_dtype == DT_INT32) { + AddTestWeights(name, dims, CastTestVector(values)); + } else { + FAIL() << "Cannot create test weights with type " + << DataTypeString(tf_dtype); + } + } + // Test validation in validation-only mode. void RunValidation(const Node* node, error::Code expected_code = error::OK, const char* expected_msg_substr = nullptr) { @@ -1664,20 +1704,146 @@ std::ostream& operator<<(std::ostream& os, const TestParamBase& p) { return os; } -// Parameterized version of OpConverterTest. This class will be instantiated -// to test all the TrtTestModes but only in FP32 precision. This means that we -// will use the following combinations of test parameters: +// Parameterized version of OpConverterTest. We have the following parameters: // 1. TrtTestMode: implicit batch, explicit batch, dynamic shape modes -// 2. DataType of the input TF tensors: DT_FLOAT -// 3. TrtPrecisionMode argument for the Converter: FP32 -class ParameterizedOpConverterTest +// 2. DataType of the input TF tensors: DT_FLOAT, DT_HALF, DT_INT32 +// 3. TrtPrecisionMode argument for the Converter: FP32, FP16, INT8 +// We will introduce subclasses that will be instantiated using different +// combinations of the DataType and TrtPrecisionMode parameters. +class ParameterizedOpConverterTestBase : public OpConverterTest, public ::testing::WithParamInterface< - std::tuple> {}; + std::tuple> { + public: + ParameterizedOpConverterTestBase() + : trt_mode(std::get<0>(GetParam())), + tf_dtype(std::get<1>(GetParam())), + converter_precision(std::get<2>(GetParam())) {} -// Instantiate parameter combinations to test. For debugging purposes it might -// make sense to run over all possible combinations, but normally a subset of -// them would be sufficient: + void Reset() { + OpConverterTest::Reset(converter_precision, trt_mode); + input_data_.clear(); + } + + // Adds an input ITensor for TRT network. Also creates the corresponding TF + // tensor, and stores it in the list of inputs (input_data_). + // + // The TF tensor is always created with concrete static input shape given by + // dims. The ITensor can have static or dynamic shape based on the trt_mode + // attribute. 
The ITensor shape is set automatically according to the trt_mode + // parameter, unless the user overrides it with an explicit + // partial_input_shape_dims argument. + // + // Parameters: + // - name of the input node + // - dims actual dimensions of the tensor that we will use during the test + // (including explicit batch dim) + // - values initial values for the TF tensor + // - dtype data type of the tensor + // - partial_input_shape dimensions which can incude unknown shapes. This can + // be empty, in that case the partial_input_shape will be set automatically + // depending on the trt_mode argument. (This argument also includes explicit + // batch dim). + // + template + void AddTestTensor(const string& name, const std::vector& dims, + DataType tf_dtype, const std::vector& values, + const std::vector& partial_input_shape_dims = {}) { + std::vector partial_shape; + if (!partial_input_shape_dims.empty()) { + partial_shape = partial_input_shape_dims; + } else { + if (trt_mode == TrtTestMode::kDynamicShape) { + // In dynamic shape mode we make all dims unknown. + partial_shape = std::vector(dims.size(), -1); + } else { + // Use static (known) input shapes. + partial_shape = dims; + } + } + AddTestTensorWithTFDims(name, partial_shape, TfDataTypeToTrt(tf_dtype)); + if (!values.empty()) { + VLOG(2) << "Adding test tensor: " << name << " " + << DataTypeString(tf_dtype); + InputOutputData data{name, AsTensor(values, dims, tf_dtype)}; + VLOG(2) << "Added tensor: " << data.name + << DataTypeString(data.tensor.dtype()); + input_data_.push_back(data); + } + } + + // Adds test tensor (same as above) but with the default tf_dtype defined by + // the test params. + void AddTestTensor(const string& name, const std::vector& dims, + const std::vector& values = {}, + const std::vector& partial_input_shape_dims = {}) { + AddTestTensor(name, dims, tf_dtype, values, + partial_input_shape_dims); + } + + // Builds and runs the converted network. Checks output tensor shape. Tests + // output values using a matcher. The network can have multiple input and + // output tensors. The inputs are defined by the input_data_ member variable. + void BuildAndRun(const string& name, + const std::vector>& expected_output_dims, + const Status& expected_runtime_status, + const std::vector>>& matcher) { + TensorShape shape; + const int n_output = expected_output_dims.size(); + ASSERT_EQ(n_output, matcher.size()); + DataVec output_data; + for (int i = 0; i < n_output; i++) { + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + string out_name = (n_output == 1) ? name : StrCat(name, ":", i); + InputOutputData data{out_name, + ConstructTensor(shape.num_elements(), 0, tf_dtype)}; + output_data.push_back(data); + } + ASSERT_FALSE(input_data_.empty()); + const int batch_size = input_data_[0].tensor.shape().dim_size(0); + Status stat = + OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); + ASSERT_EQ(expected_runtime_status, stat); + if (expected_runtime_status.ok() && stat.ok()) { + for (int i = 0; i < n_output; i++) { + // Check the shape of the actual output tensors + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + EXPECT_TRUE(output_data[i].tensor.shape() == shape) + << "Expected shape: " << shape.DebugString() << ", actual shape" + << output_data[i].tensor.shape().DebugString(); + EXPECT_THAT(GetDataAsFloat(output_data[i]), matcher[i]); + } + } + } + + // Runs validation and conversion. 
If conversion is successfull then builds + // the TRT network, executes it and checks the output. + void TestOpConverter(const string& name, const NodeDef node_def, + const std::vector& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const Matcher>& matcher) { + RunValidationAndConversion(node_def, expected_conversion_status, + name.c_str(), expected_output_dims); + if (expected_conversion_status.ok()) { + BuildAndRun(name, std::vector>({expected_output_dims}), + expected_runtime_status, + std::vector>>({matcher})); + } + } + + protected: + const TrtTestMode trt_mode; + const DataType tf_dtype; + const TrtPrecisionMode converter_precision; + DataVec input_data_; +}; + +// Op converter test in FP32 mode. While for debugging purposes it might make +// sense to run over all possible combinations, normally a subset of them +// would be sufficient: // - All valid options to TrtTestMode (implicit, explicit, dynamic shape) // - DataType: is the TF data type of the input tensors. This usually only // influences the data type added by Converter::AddInputTensor. We test the @@ -1687,66 +1853,22 @@ class ParameterizedOpConverterTest // how TRT handles the precision inside the TRT network, but should not matter // for the TF -> TRT conversion. Therefore it should be sufficient to test // for FP32. +class OpConverterTest1 : public ParameterizedOpConverterTestBase {}; + +// Instantiate parameter combinations to OpConverterTest1 INSTANTIATE_TEST_CASE_P( - OpConvTestInstantiation, ParameterizedOpConverterTest, + OpConvTestInstantiation, OpConverterTest1, ::testing::Combine(::testing::ValuesIn(ValidTrtModes), ::testing::Values(DT_FLOAT), ::testing::Values(TrtPrecisionMode::FP32))); -// Builds and runs the converted network. Checks output tensor shape. Tests -// output values using a matcher. -template -void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, - const TestParamBase& p, - const std::vector& input_vec, - const Matcher>& matcher) { - if (!p.status.ok()) { - // conversion was not successful, we cannot run the network - return; - } - if (!p.runtime_status.ok()) { - // Runtime error is expected. This can happen if the operation is invalid - // for the actual input shape. Usually we catch these errors during - // conversion. If the network was defined with dynamic input shape than we - // have to postpone these steps until runtime. - // - // TODO(tfeher) Instead of early return, modify BuildAndRun to handle - // runtime errors. 
- return; - } - typedef typename EnumToDataType::Type T; - TensorShape shape; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.input_dims, &shape)); - const DataVec input_data{ - {"input", test->AsTensor(CastTestVector(input_vec), shape)}}; - DataVec output_data{{name, test->ConstructTensor(6)}}; - test->BuildAndRun(input_data, &output_data); - // Check the shape of the actual output tensor - TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.expected_output_dims, &shape)); - EXPECT_TRUE(output_data[0].tensor.shape() == shape) - << "Expected shape: " << shape.DebugString() << ", actual shape" - << output_data[0].tensor.shape().DebugString(); - // Cast the output to float and compare to expected output - auto out_span = GetSpanForData(output_data[0]); - std::vector casted_output(out_span.begin(), out_span.end()); - EXPECT_THAT(casted_output, matcher); -} - -void InstantiateBuildAndRun(DataType tf_dtype, const string& name, - OpConverterTest* test, const TestParamBase& p, - const std::vector& input_vec, - const Matcher>& matcher) { - if (tf_dtype == DT_FLOAT) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); - } else if (tf_dtype == DT_HALF) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); - } else if (tf_dtype == DT_INT32) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); - } else { - FAIL() << "Test not supported for " << tf_dtype; - } -} - +// Base class for tests that need to be tested for both FP32 and FP16. +class OpConverterTest2 : public ParameterizedOpConverterTestBase {}; +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverterTest2, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); @@ -1884,14 +2006,7 @@ TEST_F(OpConverterTest, ConvertConst) { TestConvertConst(this); } -TEST_P(ParameterizedOpConverterTest, ConvertTranspose) { - const auto& spec = GetParam(); - const TrtTestMode trt_mode = std::get<0>(spec); - // Data type of TF input tensors - const DataType tf_dtype = std::get<1>(spec); - // Precision mode used for TensorRT engine - TrtPrecisionMode converter_precision = std::get<2>(spec); - +TEST_P(OpConverterTest1, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); @@ -1902,7 +2017,7 @@ TEST_P(ParameterizedOpConverterTest, ConvertTranspose) { std::vector test_params = { // For the first test we leave param empty. 
This signals to use a // input as weight which will be invalid - TestParamBase{{1, 1, 2, 3}, + TestParamBase{{3, 1, 2, 1}, {}, {}, {}, @@ -1936,20 +2051,17 @@ TEST_P(ParameterizedOpConverterTest, ConvertTranspose) { std::vector expected_values{1, 4, 2, 5, 3, 6}; for (auto p : test_params) { SCOPED_TRACE(p); - Reset(converter_precision, trt_mode); - AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode, - &p.partial_input_dims); + Reset(); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); if (p.param.empty()) { AddTestTensor("weights", {3}); } else { AddTestWeights("weights", {static_cast(p.param.size())}, p.param); } - RunValidationAndConversion(node_def, p.status, "my_transpose", - p.expected_output_dims); - InstantiateBuildAndRun(tf_dtype, "my_transpose", this, p, - {1, 2, 3, 4, 5, 6}, - ElementsAreArray(expected_values)); + TestOpConverter("my_transpose", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(expected_values)); } } @@ -2046,7 +2158,7 @@ TEST_F(OpConverterTest, ConvertReshape) { const DataVec input_data{{"input", AsTensor(input_vec)}}; DataVec output_data{ {"my_reshape", ConstructTensor(input_vec.size())}}; - BuildAndRun(input_data, &output_data, batch_size); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data, batch_size)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(input_vec)); } @@ -2101,7 +2213,7 @@ void TestMatMulHelper( const DataVec input_data{{"input", test->AsTensor({0, 1})}}; DataVec output_data{{"my_matmul", test->ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -2128,7 +2240,7 @@ void TestMatMulHelper( ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); const DataVec input_data{{"input", test->AsTensor({0, 1})}}; DataVec output_data{{"my_matmul", test->ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -2271,7 +2383,7 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); const DataVec input_data{{"input", AsTensor({0, 1, 2, 3})}}; DataVec output_data{{"my_matmul", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); if (!transpose_a && !transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(3, 4, 11, 16)); @@ -2291,91 +2403,70 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { TestMatMulHelper(this, get_batch_matmul_nodedef, "BatchMatMul"); } -template -void TestConvertBiasAdd(OpConverterTest* test) { +TEST_P(OpConverterTest2, ConvertBiasAdd) { + // Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. DT_FLOAT and DT_HALF are tested. // Get the NodeDef for BiasAdd. 
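Before the BiasAdd changes, a quick sanity check of the Transpose expectations above: expected_values {1, 4, 2, 5, 3, 6} come from permuting a {1, 1, 2, 3} input filled with 1..6. The full permutation vector is not visible in this hunk, so the standalone sketch below assumes a perm of {0, 3, 1, 2}, which reproduces exactly that output using plain std::vector arithmetic (no TensorRT code involved).

#include <cassert>
#include <vector>

// Permute a row-major 4-D tensor: output dim i corresponds to input dim perm[i].
std::vector<float> Transpose4D(const std::vector<float>& in,
                               const std::vector<int>& dims,
                               const std::vector<int>& perm) {
  std::vector<int> out_dims(4);
  for (int i = 0; i < 4; ++i) out_dims[i] = dims[perm[i]];
  // Row-major strides of the input tensor.
  std::vector<int> stride(4, 1);
  for (int i = 2; i >= 0; --i) stride[i] = stride[i + 1] * dims[i + 1];
  std::vector<float> out(in.size());
  int out_idx = 0;
  for (int a = 0; a < out_dims[0]; ++a)
    for (int b = 0; b < out_dims[1]; ++b)
      for (int c = 0; c < out_dims[2]; ++c)
        for (int d = 0; d < out_dims[3]; ++d) {
          const int out_coord[4] = {a, b, c, d};
          int in_idx = 0;
          // Output coordinate i indexes input dimension perm[i].
          for (int i = 0; i < 4; ++i) in_idx += out_coord[i] * stride[perm[i]];
          out[out_idx++] = in[in_idx];
        }
  return out;
}

int main() {
  const std::vector<float> in = {1, 2, 3, 4, 5, 6};  // shape {1, 1, 2, 3}
  const std::vector<float> out = Transpose4D(in, {1, 1, 2, 3}, {0, 3, 1, 2});
  assert((out == std::vector<float>{1, 4, 2, 5, 3, 6}));  // matches the test
  return 0;
}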
- auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + auto get_biasadd_nodedef = [](const string& data_format, + DataType tf_dtype) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + auto weights = ops::Placeholder(s.WithOpName("weights"), tf_dtype); const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); auto biasadd = ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); return biasadd.operation.node()->def(); }; - typedef typename EnumToDataType::Type CType; for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { - test->Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format); + Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_dtype); // Add input, dims_array will be like {2, 1, ..., 1, 3} - std::vector dims_array(trt_input_rank, 1); + std::vector dims_array(trt_input_rank + 1, 1); if (trt_input_rank == 1) { - dims_array[0] = (data_format == "NHWC" ? 3 : 2); + dims_array[1] = (data_format == "NHWC" ? 3 : 2); } else { - dims_array[0] = 2; - dims_array[trt_input_rank - 1] = 3; + dims_array[1] = 2; + dims_array[trt_input_rank] = 3; } - test->AddTestTensor("input", dims_array, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - - // Add bias weights. - const int channel_size = (data_format == "NHWC" ? 3 : 2); - std::vector bias(channel_size); - for (int i = 0; i < channel_size; ++i) { - bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} - } - test->AddTestWeights("weights", {channel_size}, bias); - - // Run the conversion. - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); - - // Build and run the engine. const int num_input = TrtTensorDimsNumElements(GetTestDims(dims_array)); ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2), num_input); + std::vector input_data(num_input, 0); + + AddTestTensor("input", dims_array, input_data); + + const int channel_size = (data_format == "NHWC" ? 3 : 2); + std::vector bias(channel_size); + for (int i = 0; i < channel_size; ++i) { + bias[i] = i + 1; // bias will be {1, 2, 3, ...} + } + AddTestWeights("weights", {channel_size}, bias, tf_dtype); + + // Build and run the engine. 
+ std::vector output_data; - const DataVec input_data{ - {"input", test->ConstructTensor(num_input, CType(0))}}; - DataVec output_data{ - {"my_biasadd", test->ConstructTensor(num_input)}}; - test->BuildAndRun(input_data, &output_data); if (trt_input_rank == 1) { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3))); + output_data = {1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2))); + output_data = {1, 2}; } } else { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3), CType(1), - CType(2), CType(3))); + output_data = {1, 2, 3, 1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(1), CType(1), CType(2), - CType(2), CType(2))); + output_data = {1, 1, 1, 2, 2, 2}; } } + TestOpConverter("my_biasadd", node_def, dims_array, Status::OK(), + Status::OK(), ElementsAreArray(output_data)); } } } -TEST_F(OpConverterTest, ConvertBiasAdd) { - // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test - // DT_INT32 type here. - TestConvertBiasAdd(this); - TestConvertBiasAdd(this); -} - template NodeDef GetBinaryOpNodeDef(const string& input_name_l, const string& input_name_r, DataType dtype) { @@ -2428,7 +2519,7 @@ void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); // After broadcasting first input becomes {3, 6, 3, 6} and second input // becomes {2, 3, 2, 3}. - test->BuildAndRun(input_data, &output_data, /*batch_size=*/2); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, /*batch_size=*/2)); if (node_def.op() == "Add") { EXPECT_THAT( GetSpanForData(output_data[0]), @@ -2562,7 +2653,7 @@ void TestAddN(OpConverterTest* test) { ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); DataVec output_data{{"my_addn", test->ConstructTensor(4)}}; - test->BuildAndRun(input_data, &output_data, /*batch_size=*/2); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, /*batch_size=*/2)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(CastTestVector({3, 6, 9, 12}))); } @@ -2586,7 +2677,7 @@ void TestAddN(OpConverterTest* test) { ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); DataVec output_data{{"my_addn", test->ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(CastTestVector({5, 8}))); } @@ -2752,7 +2843,7 @@ void TestConvertSquare(OpConverterTest* test) { // Engine outputs are converted to FP16 automatically if we set FP16 mode in // the builder. 
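Returning to the BiasAdd expectations above: since the test input is all zeros, the output is simply the bias broadcast along the channel axis, which is the last dimension for NHWC and the second for NCHW. The standalone check below reproduces the rank > 1 case with dims {1, 2, 1, 3}; it is illustrative only and uses no TensorFlow or TensorRT code.

#include <cassert>
#include <vector>

// Broadcast-add a per-channel bias to an all-zero tensor of shape {1, 2, 1, 3}
// and return the flattened (row-major) result.
std::vector<float> BiasAddZeros(const std::vector<float>& bias, bool nhwc) {
  const int dims[4] = {1, 2, 1, 3};
  const int channel_axis = nhwc ? 3 : 1;
  std::vector<float> out;
  for (int n = 0; n < dims[0]; ++n)
    for (int i = 0; i < dims[1]; ++i)
      for (int j = 0; j < dims[2]; ++j)
        for (int k = 0; k < dims[3]; ++k) {
          const int coord[4] = {n, i, j, k};
          out.push_back(0.0f + bias[coord[channel_axis]]);
        }
  return out;
}

int main() {
  // NHWC: channel size 3, bias {1, 2, 3} repeats along the innermost axis.
  assert((BiasAddZeros({1, 2, 3}, true) ==
          std::vector<float>{1, 2, 3, 1, 2, 3}));
  // NCHW: channel size 2, bias {1, 2} repeats once per channel plane.
  assert((BiasAddZeros({1, 2}, false) ==
          std::vector<float>{1, 1, 1, 2, 2, 2}));
  return 0;
}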
DataVec output_data{{"my_square", test->ConstructTensor(num_inputs)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); } @@ -2864,7 +2955,7 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { }; const DataVec input_data{{"boxes", AsTensor({0, 0, 0.3, 0.4})}, {"scores", AsTensor({0.4, 0.7, 0.3})}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4)); EXPECT_THAT(GetSpanForData(output_data[1]), ElementsAre(0.7, 0.4)); @@ -2874,90 +2965,67 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { } #endif // IS_TRT_VERSION_GE(5, 1, 0, 0) -TEST_F(OpConverterTest, ConvertActivation) { +template +NodeDef CreateUnaryOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + return T(s.WithOpName("my_unary"), input).operation.node()->def(); +} + +constexpr float kLeakyReluAlpha = 0.2f; +template <> +NodeDef CreateUnaryOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + return ops::internal::LeakyRelu( + s.WithOpName("my_unary"), input, + ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)) + .operation.node() + ->def(); +} + +TEST_P(OpConverterTest1, ConvertActivation) { { // Input is weights, should fail. Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto relu = ops::Relu(s.WithOpName("my_act"), input); - const NodeDef& node_def = relu.operation.node()->def(); + const NodeDef& node_def = CreateUnaryOp(tf_dtype); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, - "The input \"input\" for Relu must be a tensor, at my_act"); + "The input \"input\" for Relu must be a tensor, at my_unary"); } - constexpr float kLeakyReluAlpha = 0.2f; constexpr float kSeluAlpha = 1.7580993408473768599402175208123f; constexpr float kSeluScale = 1.0507009873554804934193349852946f; + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; - // Get nodedef for activation layer. 
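The CreateUnaryOp template and its LeakyRelu specialization above replace the removed per-op builder chain with a generic factory plus one explicit specialization for the op that needs an extra attribute. The toy sketch below isolates that shape of code; FakeNodeDef, FakeRelu and FakeLeaky are invented stand-ins for the real Scope/NodeDef API, used purely to show the specialization pattern.

#include <iostream>
#include <string>

// Toy stand-in for a node description (the real code builds a tf NodeDef).
struct FakeNodeDef {
  std::string op;
  float alpha = 0.0f;  // only meaningful for ops carrying an extra attribute
};

// Toy op tags standing in for ops::Relu, ops::internal::LeakyRelu, ...
struct FakeRelu  { static const char* Name() { return "Relu"; } };
struct FakeLeaky { static const char* Name() { return "LeakyRelu"; } };

// Generic factory: works for every op that needs no extra attributes.
template <typename OpT>
FakeNodeDef CreateUnaryOp() {
  return FakeNodeDef{OpT::Name()};
}

// Full specialization for the one op that takes an attribute, mirroring the
// LeakyRelu specialization in the diff.
constexpr float kLeakyReluAlpha = 0.2f;
template <>
FakeNodeDef CreateUnaryOp<FakeLeaky>() {
  return FakeNodeDef{FakeLeaky::Name(), kLeakyReluAlpha};
}

int main() {
  std::cout << CreateUnaryOp<FakeRelu>().op << "\n";      // "Relu"
  std::cout << CreateUnaryOp<FakeLeaky>().alpha << "\n";  // 0.2
  return 0;
}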
- auto get_act_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "LeakyRelu") { - auto act = ops::internal::LeakyRelu( - s.WithOpName("my_act"), input, - ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)); - return act.operation.node()->def(); - } else if (op_name == "Relu") { - auto act = ops::Relu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Relu6") { - auto act = ops::Relu6(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Sigmoid") { - auto act = ops::Sigmoid(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Tanh") { - auto act = ops::Tanh(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Elu") { - auto act = ops::Elu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Selu") { - auto act = ops::Selu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softsign") { - auto act = ops::Softsign(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softplus") { - auto act = ops::Softplus(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for activation layer. - auto get_act_output = [](string op_name, float input) -> float { - if (op_name == "LeakyRelu") { - return (input > 0.0f) ? input : input * kLeakyReluAlpha; - } else if (op_name == "Relu") { - return (input > 0.0f) ? input : 0.0f; - } else if (op_name == "Relu6") { - return std::min(std::max(input, 0.0f), 6.0f); - } else if (op_name == "Sigmoid") { - return 1.0f / (1.0f + std::exp(-input)); - } else if (op_name == "Tanh") { - return std::tanh(input); - } else if (op_name == "Elu") { - return (input > 0.0f) ? input : std::exp(input) - 1; - } else if (op_name == "Selu") { - return (input > 0.0f) ? kSeluScale * input - : kSeluScale * kSeluAlpha * (std::exp(input) - 1); - } else if (op_name == "Softsign") { - return input / (std::abs(input) + 1); - } else if (op_name == "Softplus") { - return std::log(std::exp(input) + 1); - } - EXPECT_TRUE(false); - return 0; - }; +#define ADD_OP(name, op, compute) \ + op_map[name] = std::make_pair(CreateUnaryOp, compute) + ADD_OP("LeakyRelu", ops::internal::LeakyRelu, + [](float x) { return (x > 0.0f) ? x : x * kLeakyReluAlpha; }); + ADD_OP("Relu", ops::Relu, [](float x) { return (x > 0.0f) ? x : 0.0f; }); + ADD_OP("Relu6", ops::Relu6, + [](float x) { return std::min(std::max(x, 0.0f), 6.0f); }); + ADD_OP("Sigmoid", ops::Sigmoid, + [](float x) { return 1.0f / (1.0f + std::exp(-x)); }); + ADD_OP("Tanh", ops::Tanh, static_cast(std::tanh)); + ADD_OP("Elu", ops::Elu, + [](float x) { return (x > 0.0f) ? x : std::exp(x) - 1; }); + ADD_OP("Selu", ops::Selu, [](float x) { + return (x > 0.0f) ? kSeluScale * x + : kSeluScale * kSeluAlpha * (std::exp(x) - 1); + }); + ADD_OP("Softsign", ops::Softsign, + [](float x) { return x / (std::abs(x) + 1); }); + ADD_OP("Softplus", ops::Softplus, + [](float x) { return std::log(std::exp(x) + 1); }); +#undef ADD_OP // Get list of ops to test. std::vector ops_to_test; - // Add all ops supported by ConvertUnary. + // Add all ops supported by ConvertActivation. 
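The ADD_OP table above pairs each op name with a NodeDef factory and a plain float-to-float reference implementation, so expected outputs are derived with std::transform instead of a second hand-written switch. A minimal standalone version of that pattern, with no TF or TRT types and only a few representative activations:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <functional>
#include <map>
#include <string>
#include <vector>

int main() {
  using ValFunc = std::function<float(float)>;
  std::map<std::string, ValFunc> reference;
  reference["Relu"]    = [](float x) { return x > 0.0f ? x : 0.0f; };
  reference["Relu6"]   = [](float x) { return std::min(std::max(x, 0.0f), 6.0f); };
  reference["Sigmoid"] = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };

  // One shared input vector; expected outputs are computed, not typed in.
  const std::vector<float> input = {-100.0f, -2.0f, -1.0f, 0.0f, 1.0f, 88.0f};
  for (const auto& entry : reference) {
    std::vector<float> expected;
    std::transform(input.begin(), input.end(), std::back_inserter(expected),
                   entry.second);
    assert(expected.size() == input.size());
    // In the real test, `expected` feeds TestOpConverter(..., ArrayFloatNear(...))
    // rather than being asserted locally like this.
  }
  return 0;
}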
auto* map = ActivationTypeMap(); ops_to_test.reserve(map->size()); for (auto& pair : *map) { @@ -2966,16 +3034,30 @@ TEST_F(OpConverterTest, ConvertActivation) { // Add other activation ops to test. ops_to_test.push_back("Relu6"); ops_to_test.push_back("LeakyRelu"); + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; // Ok. for (const string& op_name : ops_to_test) { + if (!op_map.count(op_name)) { + FAIL() << "Activation op test map does not contain op " << op_name; + } Reset(); - NodeDef node_def = get_act_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); + NodeDef node_def = op_map[op_name].first(tf_dtype); + const std::vector input = {-100, -2, -1, 0, 1, 88}; + AddTestTensor("input", p.input_dims, input); + + // std::exp in Softplus will overflow for input > 88 + std::vector output_values; + std::transform(input.begin(), input.end(), + std::back_inserter(output_values), op_map[op_name].second); + TestOpConverter("my_unary", node_def, p.expected_output_dims, Status::OK(), + Status::OK(), ArrayFloatNear(output_values, 0, false)); + TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_act", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); + TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); // Certain activations should set quantization range automatically. auto ranges = quantization_ranges(); @@ -2985,17 +3067,6 @@ TEST_F(OpConverterTest, ConvertActivation) { op_name == "Softsign") { EXPECT_EQ(ranges[output.tensor()], 1.0f); } - - // std::exp in Softplus will overflow for input > 88 - const std::vector input = {-100, -2, -1, 0, 1, 88}; - const DataVec input_data{{"input", AsTensor(input)}}; - DataVec output_data{{"my_act", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); i++) { - const float expected_output = get_act_output(op_name, input[i]); - EXPECT_FLOAT_EQ(GetSpanForData(output_data[0])[i], - expected_output); - } } } @@ -3095,23 +3166,17 @@ TEST_F(OpConverterTest, ConvertExpandDims) { const DataVec input_data{{"input", AsTensor({1, 2, 3, 4, 5, 6})}}; DataVec output_data{{"my_expanddims", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 2, 3, 4, 5, 6)); } } -TEST_P(ParameterizedOpConverterTest, ConvertSqueeze) { - const auto& spec = GetParam(); - const TrtTestMode trt_mode = std::get<0>(spec); +TEST_P(OpConverterTest1, ConvertSqueeze) { const bool use_implicit_batch = (trt_mode == TrtTestMode::kImplicitBatch); - // Data type of TF input tensors - const DataType tf_dtype = std::get<1>(spec); - // Precision mode used for TensorRT engine - TrtPrecisionMode converter_precision = std::get<2>(spec); - // Get the NodeDef for Squeeze. 
- auto get_squeeze_nodedef = [tf_dtype](std::vector axes) -> NodeDef { + auto get_squeeze_nodedef = [](std::vector axes, + DataType tf_dtype) -> NodeDef { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); if (!axes.empty()) { @@ -3129,11 +3194,13 @@ TEST_P(ParameterizedOpConverterTest, ConvertSqueeze) { TestParamBase{ {1, 2, 1, 3}, // input dims {}, // input partial dims - {2, 1, 3}, // expected output dims + {2, 3}, // expected output dims {}, // axis - Status{ - error::UNIMPLEMENTED, - "Squeeze is only implemented for explicit dims, at my_squeeze"}}, + trt_mode == TrtTestMode::kExplicitBatch + ? Status::OK() + : Status{error::UNIMPLEMENTED, + "Squeeze is not implemented for empty squeeze_dims, at " + "my_squeeze"}}, TestParamBase{{1, 2, 1, 3}, {}, {2, 1, 3}, @@ -3202,14 +3269,12 @@ TEST_P(ParameterizedOpConverterTest, ConvertSqueeze) { for (TestParamBase p : test_params) { SCOPED_TRACE(p); - Reset(converter_precision, trt_mode); - NodeDef node_def = get_squeeze_nodedef(p.param); - AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode, - &p.partial_input_dims); - RunValidationAndConversion(node_def, p.status, "my_squeeze", - p.expected_output_dims); - InstantiateBuildAndRun(tf_dtype, "my_squeeze", this, p, {1, 2, 3, 4, 5, 6}, - ElementsAreArray({1, 2, 3, 4, 5, 6})); + Reset(); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_dtype); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + TestOpConverter("my_squeeze", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } @@ -3812,7 +3877,7 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { DataVec output_data{ {"my_strided_slice", ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -3952,21 +4017,22 @@ TEST_F(OpConverterTest, ConvertSlice) { const DataVec input_data{{"input", AsTensor({1, 2, 3, 4, 5, 6})}}; DataVec output_data{{"my_slice", ConstructTensor( ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } } -TEST_F(OpConverterTest, ConvertConv2D) { +TEST_P(OpConverterTest1, ConvertConv2D) { // Get nodedef for Conv2D layer. + DataType tf_type = tf_dtype; auto get_conv2d_nodedef = - [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCHW", - std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); ops::Conv2D::Attrs attrs = ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, @@ -3988,7 +4054,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Filter is tensor, should fail. 
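An aside on the Squeeze expectations above: with an empty squeeze_dims, TensorFlow removes every size-1 dimension, so {1, 2, 1, 3} becomes {2, 3} rather than {2, 1, 3}; in implicit batch mode the batch dimension cannot be touched, hence the UNIMPLEMENTED status instead of an output shape. The shape rule in isolation (plain C++, no TRT):

#include <cassert>
#include <vector>

// Squeeze semantics for an empty squeeze_dims: drop every dimension of size 1.
std::vector<int> SqueezeAll(const std::vector<int>& dims) {
  std::vector<int> out;
  for (int d : dims)
    if (d != 1) out.push_back(d);
  return out;
}

int main() {
  assert((SqueezeAll({1, 2, 1, 3}) == std::vector<int>{2, 3}));
  return 0;
}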
Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 1, 2, 1}); AddTestTensor("weights", {3, 3, 1, 1}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -3998,7 +4064,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Filter is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4009,7 +4075,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4020,7 +4086,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "Dilation rate must be 1 for batch and channel " @@ -4031,7 +4097,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + AddTestTensor("input", {1, 2, 3, 1}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "Dilation rate must be 1 for batch and channel " @@ -4042,7 +4108,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4053,12 +4119,23 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "Stride must be 1 for batch and channel dimensions, at my_conv2d"); } + if (trt_mode == TrtTestMode::kDynamicShape) { + Reset(); + NodeDef node_def = get_conv2d_nodedef(); + // Channel dim unknown, should fail. + AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, + TfDataTypeToTrt(tf_type)); + AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Channel dimension must be static, at my_conv2d"); + } struct TestParams { std::vector input_dims; @@ -4076,7 +4153,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Ok. 
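The first "Basic" Conv2D case that follows is easy to verify by hand: the {1, 1, 2, 3} input holds the rows [0, 1, 2] and [3, 3, 4], the 1x2 filter is [-1, 1], and VALID padding slides the filter along each row, giving {1, 1, 0, 1}. A standalone check of that arithmetic (single channel, stride 1, no dilation; illustrative only):

#include <cassert>
#include <vector>

// 1x2 VALID convolution over a single-channel H x W input, stride 1.
std::vector<float> Conv1x2Valid(const std::vector<float>& in, int h, int w,
                                float f0, float f1) {
  std::vector<float> out;
  for (int r = 0; r < h; ++r)
    for (int c = 0; c + 1 < w; ++c)
      out.push_back(f0 * in[r * w + c] + f1 * in[r * w + c + 1]);
  return out;
}

int main() {
  // Input {1, 1, 2, 3} flattened: rows [0, 1, 2] and [3, 3, 4]; filter [-1, 1].
  const std::vector<float> out = Conv1x2Valid({0, 1, 2, 3, 3, 4}, 2, 3, -1.0f, 1.0f);
  assert((out == std::vector<float>{1, 1, 0, 1}));  // matches expected_output
  return 0;
}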
std::vector ok_params = { // Basic - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4084,10 +4161,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, // SAME padding (Asymmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4095,10 +4172,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 1, -2, 0, 1, -4}}, // SAME padding (Symmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 3, 1, 1}, /*filter=*/{-1, 0, 1}, @@ -4106,10 +4183,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, // NHWC - TestParams{/*input_dims=*/{2, 3, 1}, + TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4117,10 +4194,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{2, 2, 1}, + /*expected_output_dims=*/{1, 2, 2, 1}, /*expected_output=*/{1, 1, 0, 1}}, // Dilated - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4128,10 +4205,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 2}, - /*expected_output_dims=*/{1, 2, 1}, + /*expected_output_dims=*/{1, 1, 2, 1}, /*expected_output=*/{2, 1}}, // Strided - TestParams{/*input_dims=*/{1, 2, 4}, + TestParams{/*input_dims=*/{1, 1, 2, 4}, /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4139,7 +4216,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, }; @@ -4148,23 +4225,22 @@ TEST_F(OpConverterTest, ConvertConv2D) { NodeDef node_def = get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, ok_params[i].dilations); - AddTestTensor("input", ok_params[i].input_dims); + std::vector partial_input_shape; + if (trt_mode == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(ok_params[i].input_dims.size(), -1); + int channel_id = (ok_params[i].data_format == "NCHW") ? 
1 : 3; + partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; + } + + AddTestTensor("input", ok_params[i].input_dims, tf_dtype, + ok_params[i].input, partial_input_shape); AddTestWeights("weights", ok_params[i].filter_dims, ok_params[i].filter); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv2d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + TestOpConverter("my_conv2d", node_def, ok_params[i].expected_output_dims, + Status::OK(), Status::OK(), + ElementsAreArray(ok_params[i].expected_output)); } } @@ -4289,7 +4365,7 @@ TEST_F(OpConverterTest, ConvertConv2DBackpropInput) { DataVec output_data{ {"my_conv2d_backprop_input", ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -4621,7 +4697,7 @@ TEST_F(OpConverterTest, ConvertConv3D) { DataVec output_data{ {"my_conv3d", ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -4808,7 +4884,7 @@ TEST_F(OpConverterTest, ConvertPool3D) { DataVec output_data{ {expected_node_name, ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -4853,7 +4929,7 @@ TEST_F(OpConverterTest, ConvertTopK) { {"input", AsTensor({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}}; DataVec output_data{{"my_topk", ConstructTensor(4)}, {"my_topk:1", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(6, 5, 7, 1)); EXPECT_THAT(GetSpanForData(output_data[1]), @@ -5040,8 +5116,8 @@ void TestConvertGather(OpConverterTest* test) { } DataVec output_data{ {"my_gather", test->ConstructTensor(expected_output.size())}}; - test->BuildAndRun(input_data, &output_data, - /*batch_size=*/expected_output_shape[0]); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data, + /*batch_size=*/expected_output_shape[0])); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(converted_expected_output)); } @@ -5112,135 +5188,52 @@ TEST_F(OpConverterTest, ConvertGather) { TestConvertGather(this); } -TEST_F(OpConverterTest, ConvertUnary) { +NodeDef CreateCastOp(DataType tf_dtype) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + +TEST_P(OpConverterTest1, ConvertUnary) { { // Input is weights, should fail. 
Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto neg = ops::Neg(s.WithOpName("my_unary"), input); - const NodeDef& node_def = neg.operation.node()->def(); + const NodeDef node_def = CreateUnaryOp(tf_dtype); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Neg must be a tensor, at my_unary"); } - - // Get nodedef for unary layer. - auto get_unary_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "Abs") { - auto unary = ops::Abs(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acos") { - auto unary = ops::Acos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acosh") { - auto unary = ops::Acosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asin") { - auto unary = ops::Asin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asinh") { - auto unary = ops::Asinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atan") { - auto unary = ops::Atan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atanh") { - auto unary = ops::Atanh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Ceil") { - auto unary = ops::Ceil(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cos") { - auto unary = ops::Cos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cosh") { - auto unary = ops::Cosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Exp") { - auto unary = ops::Exp(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Floor") { - auto unary = ops::Floor(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Log") { - auto unary = ops::Log(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Neg") { - auto unary = ops::Neg(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Reciprocal") { - auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Rsqrt") { - auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sin") { - auto unary = ops::Sin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sinh") { - auto unary = ops::Sinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sqrt") { - auto unary = ops::Sqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Tan") { - auto unary = ops::Tan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for unary layer. 
- auto get_unary_output = [](string op_name, float input) -> float { - if (op_name == "Abs") { - return std::abs(input); - } else if (op_name == "Acos") { - return std::acos(input); - } else if (op_name == "Acosh") { - return std::acosh(input); - } else if (op_name == "Asin") { - return std::asin(input); - } else if (op_name == "Asinh") { - return std::asinh(input); - } else if (op_name == "Atan") { - return std::atan(input); - } else if (op_name == "Atanh") { - return std::atanh(input); - } else if (op_name == "Ceil") { - return std::ceil(input); - } else if (op_name == "Cos") { - return std::cos(input); - } else if (op_name == "Cosh") { - return std::cosh(input); - } else if (op_name == "Exp") { - return std::exp(input); - } else if (op_name == "Floor") { - return std::floor(input); - } else if (op_name == "Log") { - return std::log(input); - } else if (op_name == "Neg") { - return -input; - } else if (op_name == "Reciprocal") { - return 1.0 / input; - } else if (op_name == "Rsqrt") { - return 1.0 / std::sqrt(input); - } else if (op_name == "Sin") { - return std::sin(input); - } else if (op_name == "Sinh") { - return std::sinh(input); - } else if (op_name == "Sqrt") { - return std::sqrt(input); - } else if (op_name == "Tan") { - return std::tan(input); - } - EXPECT_TRUE(false); - return 0; - }; - + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("Abs", ops::Abs, std::abs); + ADD_OP("Acos", ops::Acos, std::acos); + ADD_OP("Acosh", ops::Acosh, std::acosh); + ADD_OP("Asin", ops::Asin, std::asin); + ADD_OP("Asinh", ops::Asinh, std::asinh); + ADD_OP("Atan", ops::Atan, std::atan); + ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); + ADD_OP("Ceil", ops::Ceil, std::ceil); + ADD_OP("Cos", ops::Cos, std::cos); + ADD_OP("Cosh", ops::Cosh, std::cosh); + ADD_OP("Exp", ops::Exp, std::exp); + ADD_OP("Floor", ops::Floor, std::floor); + ADD_OP("Log", ops::Log, std::log); + ADD_OP("Neg", ops::Neg, [](float x) { return -x; }); + ADD_OP("Reciprocal", ops::Reciprocal, [](float x) { return 1.0f / x; }); + ADD_OP("Rsqrt", ops::Rsqrt, [](float x) { return 1.0f / std::sqrt(x); }); + ADD_OP("Sin", ops::Sin, std::sin); + ADD_OP("Sinh", ops::Sinh, std::sinh); + ADD_OP("Sqrt", ops::Sqrt, std::sqrt); + ADD_OP("Tan", ops::Tan, std::tan); +#undef ADD_OP // Get list of ops to test. std::vector ops_to_test; // Add all ops supported by ConvertUnary. @@ -5251,26 +5244,35 @@ TEST_F(OpConverterTest, ConvertUnary) { } // Add other unary ops to test. ops_to_test.push_back("Rsqrt"); - // Ok. 
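The unary ADD_OP table above stores each reference implementation as a plain float (*)(float); the static_cast<ValFunc>(...) in the macro is what selects the float overload of functions such as std::sqrt out of their overload sets. The snippet below isolates that idiom in standard C++ (nothing TF-specific); a small lambda wrapper, also shown, is an equally valid and slightly more conservative alternative.

#include <cassert>
#include <cmath>

int main() {
  using ValFunc = float (*)(float);

  // std::sqrt has float/double/long double overloads; the cast picks one.
  ValFunc f = static_cast<ValFunc>(std::sqrt);
  assert(f(4.0f) == 2.0f);

  // Equivalent result without taking the address of a library function.
  auto g = [](float x) { return std::sqrt(x); };
  assert(g(9.0f) == 3.0f);
  return 0;
}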
+ // Prepare test parameters + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; for (const string& op_name : ops_to_test) { + SCOPED_TRACE(op_name); Reset(); - NodeDef node_def = get_unary_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - const std::vector input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; - const DataVec input_data{{"input", AsTensor(input)}}; - DataVec output_data{{"my_unary", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); ++i) { - const float expected_output = get_unary_output(op_name, input[i]); - EXPECT_THAT(GetSpanForData(output_data[0])[i], - NanSensitiveFloatNear(expected_output, 0.0001)); + if (!op_map.count(op_name)) { + FAIL() << "Unary op test map does not contain op " << op_name; } + NodeDef node_def = op_map[op_name].first(tf_dtype); + + // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for + // now. Need to find a better way to express input and output types. + // + // TODO(tfeher): improve tests by defining an expected output data type and + // check that. Currently only the shape and values of the output are + // checked. + DataType input_tf_dtype = op_name == "Cast" ? DT_HALF : tf_dtype; + + std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; + AddTestTensor("input", p.input_dims, input_tf_dtype, input_values); + std::vector output; + std::transform(input_values.begin(), input_values.end(), + std::back_inserter(output), op_map[op_name].second); + TestOpConverter("my_unary", node_def, p.expected_output_dims, Status::OK(), + p.runtime_status, ArrayFloatNear(output, 0.0001, true)); } } @@ -5374,7 +5376,7 @@ void TestConvertConcat(OpConverterTest* test) { DataVec output_data{ {"my_concat", test->ConstructTensor(ok_params[i].expected_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(ok_params[i].expected_output)); } @@ -5539,7 +5541,7 @@ void TestConvertSplit(OpConverterTest* test) { // Verify output values are correct. const DataVec input_data{ {"value", test->AsTensor(ok_params[i].value)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5716,7 +5718,7 @@ void TestConvertUnpack(OpConverterTest* test) { // Verify output values are correct. 
const DataVec input_data{ {"value", test->AsTensor(ok_params[i].value)}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5885,7 +5887,7 @@ void TestConvertPack(OpConverterTest* test) { } DataVec output_data{{"my_pack", test->ConstructTensor( params[i].expected_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6033,7 +6035,7 @@ void TestConvertArgMinMax(OpConverterTest* test) { DataVec output_data{ {"my_arg", test->ConstructTensor( params[i].expected_argmax_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (node_def.op() == "ArgMax") { EXPECT_THAT(GetSpanForData(output_data[0]), @@ -6132,7 +6134,7 @@ void TestConvertDepthSpaceShuffle( DataVec input_data{{"input", test->AsTensor(params[i].input_value)}}; DataVec output_data{{"my_shuffle", test->ConstructTensor( params[i].expected_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6408,7 +6410,7 @@ void TestConvertClipByValue(OpConverterTest* test) { DataVec input_data{{"t", test->AsTensor(params[i].input_value)}}; DataVec output_data{{"my_clip", test->ConstructTensor( params[i].expected_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6516,7 +6518,7 @@ void TestConvertSquaredDifference(OpConverterTest* test) { DataVec output_data{ {"my_squared_diff", test->ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAreArray(params[i].expected_output)); } @@ -6621,7 +6623,7 @@ void TestConvertResize(OpConverterTest* test) { {"my_resize", test->ConstructTensor( params[i].expected_nearest_output_values.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (node_def.op() == "ResizeBilinear") { ExpectArrayAlmostEqual(params[i].expected_bilinear_output_values, @@ -6721,7 +6723,7 @@ void TestConvertPad(OpConverterTest* test) { {"my_pad", test->ConstructTensor( params[i].expected_output_values.size())}}; - test->BuildAndRun(input_data, &output_data); + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); ExpectArrayAlmostEqual(params[i].expected_output_values, GetSpanForData(output_data[0]), CType(1e-5)); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6ab719db54d..72f4fe5ef9b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -228,9 +228,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, << "This can result in poor performance."; } } - grappler::GraphProperties static_graph_properties(item); - 
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - ConversionParams cp; if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) { VLOG(1) << "Calibration with FP32 or FP16 is not implemented. " @@ -255,7 +252,9 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, } nodes_to_preserve.push_back(s); } - cp.input_graph_def = &item.graph; + + ConversionParams cp; + cp.grappler_item = &item; cp.output_names = &nodes_to_preserve; cp.trt_logger_name = trt_logger_name_; cp.max_batch_size = maximum_batch_size_; @@ -263,7 +262,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.output_graph_def = optimized_graph; cp.precision_mode = precision_mode_; cp.minimum_segment_size = minimum_segment_size_; - cp.graph_properties = &static_graph_properties; cp.cluster = cluster; cp.is_dyn_op = is_dynamic_op_; cp.max_cached_engines = max_cached_batches_; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index fb3ae6943d3..a4b64ec0dc5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { @@ -185,6 +186,40 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, return Status::OK(); } +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; + break; + default: + return errors::Internal("Unsupported tensorflow type"); + } + return Status::OK(); +} + +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; + default: + return errors::Internal("Invalid TRT type"); + } + return Status::OK(); +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 5d4cf1bb851..43697573bbd 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -29,6 +29,8 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +static constexpr char kCastOutputTypeAttrName[] = "DstT"; + class IONamePrefixes { public: static constexpr const char* const kInputPHName = "TensorRTInputPH_"; @@ -106,6 +108,9 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, bool use_implicit_batch, int batch_size, TensorShape& shape); +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + // Returns a string that includes compile time TensorRT library version // information {Maj, Min, Patch}. 
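The TfTypeToTrtType/TrtTypeToTfType helpers added to utils.cc and declared in utils.h above give a Status-returning mapping between TF and TensorRT dtypes. A minimal usage sketch, assuming the TF/TensorRT headers of this source tree; the RoundTripHalf name and the round-trip check are illustrative, not part of the change.

#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/core/platform/errors.h"

namespace tensorflow {
namespace tensorrt {

// Convert DT_HALF to the TRT enum and back, propagating errors via Status.
Status RoundTripHalf() {
  nvinfer1::DataType trt_type;
  TF_RETURN_IF_ERROR(TfTypeToTrtType(DT_HALF, &trt_type));  // -> kHALF
  DataType tf_type;
  TF_RETURN_IF_ERROR(TrtTypeToTfType(trt_type, &tf_type));  // -> DT_HALF
  if (tf_type != DT_HALF) {
    return errors::Internal("Round trip changed the type");
  }
  return Status::OK();
}

}  // namespace tensorrt
}  // namespace tensorflow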
string GetLinkedTensorRTVersion(); diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index a5332385994..37110442b26 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -81,7 +81,7 @@ tf_portable_proto_library( name = "portable_tf2xla_proto", config_string = "allow_all:true", header_outs = ["//tensorflow/compiler/tf2xla/tf2xla.proto.h"], - portable_deps = ["//tensorflow/core:portable_proto_lib_full_runtime"], + portable_deps = ["//tensorflow/core:portable_proto_lib"], proto_deps = [ ":tf2xla_proto", "//tensorflow/core:protos_all", @@ -182,6 +182,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], ) @@ -349,6 +350,7 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", + "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", @@ -703,12 +705,8 @@ cc_library( deps = [ "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:core_cpu", - "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/core:lib", "@llvm-project//llvm:support", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index 57278eea292..a9385e05564 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -49,10 +49,12 @@ typedef std::unordered_map NodeMap; // Each feed id identifies the positional output of some node, which may consist // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed // tensor with a placeholder. For each feed tensor, replaces all edges so they -// point from a new _Arg node instead. +// point from a new _Arg node instead. The newly created _Arg nodes are added to +// `arg_nodes`. Status AddArgNodes(Graph* graph, const NodeMap& node_map, const protobuf::RepeatedPtrField& feeds, - const std::unordered_map& feed_remapping) { + const std::unordered_map& feed_remapping, + std::unordered_set* arg_nodes) { for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) { const tf2xla::Feed& feed = feeds[arg_index]; // All feeds have been replaced by placeholders. @@ -86,6 +88,7 @@ Status AddArgNodes(Graph* graph, const NodeMap& node_map, .Attr(kShapeAttr, TensorShape(feed.shape())) .Attr(kDebugNameAttr, feed.name()) .Finalize(graph, &arg_node)); + arg_nodes->insert(arg_node); // Collects out-edges from the feed node that have a matching edge index; // these will be replaced with edges from the arg node instead. 
@@ -149,13 +152,13 @@ Status RewriteAndPruneGraph( for (Node* n : graph->nodes()) { node_map[n->name()] = n; } + std::unordered_set nodes_to_keep; + TF_RETURN_IF_ERROR(AddArgNodes(graph, node_map, config.feed(), feed_remapping, + &nodes_to_keep)); TF_RETURN_IF_ERROR( - AddArgNodes(graph, node_map, config.feed(), feed_remapping)); - std::unordered_set retval_nodes; - TF_RETURN_IF_ERROR( - AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes)); + AddRetvalNodes(graph, node_map, config.fetch(), &nodes_to_keep)); VLOG(2) << "Post rewrite: " << DumpGraphToFile("tf2xla_post_rewrite", *graph); - PruneForReverseReachability(graph, std::move(retval_nodes)); + PruneForReverseReachability(graph, std::move(nodes_to_keep)); FixupSourceAndSinkEdges(graph); VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph); // Sanity-check, to make sure the feeds and fetches still exist post-pruning. @@ -277,8 +280,16 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Prune the GraphDef first so that unknown ops that we aren't compiling get // filtered out. GraphDef second_copy_def; + // Add the placeholder nodes as "fetches" in prune_config, such that they will + // be preserved in PruneGraphDefInto. + auto prune_config = config; + for (const auto& entry : feed_remapping) { + auto ph = prune_config.add_fetch(); + *ph->mutable_id()->mutable_node_name() = entry.second; + ph->mutable_id()->set_output_index(0); + } TF_RETURN_IF_ERROR( - PruneGraphDefInto(config, first_copy_def, &second_copy_def)); + PruneGraphDefInto(prune_config, first_copy_def, &second_copy_def)); TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index fb89742b139..c1f60abc0d6 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -106,8 +106,9 @@ class DataFormatVecPermuteOp : public XlaOpKernel { errors::InvalidArgument( "Input must be a vector or matrix, but got shape ", input_tensor_shape.DebugString())); + const int dim0 = input_tensor_shape.dim_size(0); OP_REQUIRES( - ctx, input_tensor_shape.dim_size(0) == 4, + ctx, dim0 == 2 || dim0 == 4, errors::InvalidArgument( "First dimension of input must be of size 4, but got shape ", input_tensor_shape.DebugString())); @@ -118,10 +119,25 @@ class DataFormatVecPermuteOp : public XlaOpKernel { "Second dimension of 2D input must be of size 2, but got shape ", input_tensor_shape.DebugString())); } - int32 dst_indices[4]; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - if (src_format_[i] == dst_format_[j]) { + + string src_format_str = src_format_; + string dst_format_str = dst_format_; + if (dim0 == 2) { + // If the input is a vector of size 2, treat the two elements as spatial + // dimensions. 
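The index mapping built just below is simple to trace by hand: with a 2-element input only the spatial characters of the format strings matter, so both "NHWC" and "NCHW" reduce to "HW" and the permutation becomes the identity, while the 4-element case yields the familiar NHWC-to-NCHW reorder. A plain C++ version of that dst_indices computation (illustrative; it mirrors the loop added in the kernel):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Keep only the spatial dimension characters, as the kernel does for
// 2-element inputs.
std::string SpatialOnly(std::string s) {
  s.erase(std::remove_if(s.begin(), s.end(),
                         [](char c) { return c != 'H' && c != 'W'; }),
          s.end());
  return s;
}

// dst_indices[j] = position in src of the dimension named dst[j].
std::vector<int> PermIndices(const std::string& src, const std::string& dst) {
  std::vector<int> idx(dst.size());
  for (size_t i = 0; i < src.size(); ++i)
    for (size_t j = 0; j < dst.size(); ++j)
      if (src[i] == dst[j]) idx[j] = static_cast<int>(i);
  return idx;
}

int main() {
  // Full 4-element case, NHWC -> NCHW: N stays at 0, C moves from 3 to 1.
  assert((PermIndices("NHWC", "NCHW") == std::vector<int>{0, 3, 1, 2}));
  // 2-element case: both formats reduce to "HW", so the permutation is identity.
  assert((PermIndices(SpatialOnly("NHWC"), SpatialOnly("NCHW")) ==
          std::vector<int>{0, 1}));
  return 0;
}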
+ auto keep_only_spatial_dimensions = [](string* format_str) -> void { + auto new_end = std::remove_if( + format_str->begin(), format_str->end(), + [](const char dim) { return dim != 'H' && dim != 'W'; }); + format_str->erase(new_end, format_str->end()); + }; + keep_only_spatial_dimensions(&src_format_str); + keep_only_spatial_dimensions(&dst_format_str); + } + std::vector dst_indices(dim0); + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < dim0; ++j) { + if (src_format_str[i] == dst_format_str[j]) { dst_indices[j] = i; break; } diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index bb2c0d9ddb8..5dbc083368c 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -28,6 +28,15 @@ limitations under the License. namespace tensorflow { namespace { +absl::InlinedVector SliceVector(xla::XlaOp input, int64 rank) { + absl::InlinedVector scalar_indices; + scalar_indices.reserve(rank); + for (int i = 0; i < rank; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(input, {i}, {i + 1}, {1}), {})); + return scalar_indices; +} + class DynamicUpdateSliceOp : public XlaOpKernel { public: explicit DynamicUpdateSliceOp(OpKernelConstruction* context) @@ -41,21 +50,23 @@ class DynamicUpdateSliceOp : public XlaOpKernel { const TensorShape update_shape = ctx->InputShape("update"); const TensorShape index_shape = ctx->InputShape("indices"); + int64 rank = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(index_shape) && - index_shape.num_elements() == input_shape.dims(), + index_shape.num_elements() == rank, errors::InvalidArgument("index must be a vector with length equal to " "the number of input dimensions")); OP_REQUIRES( - ctx, input_shape.dims() == update_shape.dims(), + ctx, rank == update_shape.dims(), errors::InvalidArgument("input and update must have the same rank," " input shape is ", input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); + xla::XlaOp indices = ctx->Input("indices"); xla::XlaOp result = xla::DynamicUpdateSlice( - ctx->Input("input"), ctx->Input("update"), ctx->Input("indices")); + ctx->Input("input"), ctx->Input("update"), SliceVector(indices, rank)); ctx->SetOutput(0, result); } }; @@ -76,17 +87,18 @@ class DynamicSliceOp : public XlaOpKernel { const TensorShape start_indices_shape = ctx->InputShape("start_indices"); const TensorShape size_indices_shape = ctx->InputShape("size_indices"); + int64 rank = input_shape.dims(); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(start_indices_shape) && - start_indices_shape.num_elements() == input_shape.dims(), + start_indices_shape.num_elements() == rank, errors::InvalidArgument( "start_indices must be a vector with length equal to " "input rank, but input rank is ", - input_shape.dims(), " and start_indices has shape ", + rank, " and start_indices has shape ", start_indices_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(size_indices_shape) && - size_indices_shape.num_elements() == input_shape.dims(), + size_indices_shape.num_elements() == rank, errors::InvalidArgument( "size_indices must be a vector with length equal to " "input rank, but input rank is ", @@ -96,8 +108,10 @@ class DynamicSliceOp : public XlaOpKernel { std::vector size_indices; OP_REQUIRES_OK( ctx, ctx->ConstantInputAsIntVector("size_indices", &size_indices)); + + xla::XlaOp start_indices = ctx->Input("start_indices"); xla::XlaOp result = 
xla::DynamicSlice( - ctx->Input("input"), ctx->Input("start_indices"), size_indices); + ctx->Input("input"), SliceVector(start_indices, rank), size_indices); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index a3fcb4d4b8f..bd6f58453df 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -81,9 +82,7 @@ class MatMulOp : public XlaOpKernel { b = xla::ConvertElementType(b, xla::F32); } } - auto lhs = (transpose_a_) ? xla::Transpose(a, {1, 0}) : a; - auto rhs = (transpose_b_) ? xla::Transpose(b, {1, 0}) : b; - ctx->SetOutput(0, xla::Dot(lhs, rhs)); + ctx->SetOutput(0, xla::BatchDot(a, transpose_a_, b, transpose_b_)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index 8431724f438..beb8e7aa174 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -36,10 +36,8 @@ limitations under the License. namespace tensorflow { namespace { -// TODO(phawkins): implement double-sized windowed reductions in XLA and remove -// the type constraint. -constexpr std::array kScanOpTypes = { - {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}}; +constexpr std::array kScanOpTypes = { + {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32}}; class ScanOp : public XlaOpKernel { public: diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 17d0b87edda..7f274c6b00f 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -42,19 +42,17 @@ class SliceOp : public XlaOpKernel { const TensorShape begin_tensor_shape = ctx->InputShape(1); const TensorShape size_tensor_shape = ctx->InputShape(2); + const int input_dims = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(begin_tensor_shape) && TensorShapeUtils::IsVector(size_tensor_shape) && - begin_tensor_shape.num_elements() == input_shape.dims() && - size_tensor_shape.num_elements() == input_shape.dims(), + begin_tensor_shape.num_elements() == input_dims && + size_tensor_shape.num_elements() == input_dims, errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", - input_shape.dims(), ", but got shapes ", - begin_tensor_shape.DebugString(), " and ", - size_tensor_shape.DebugString(), " instead.")); - - const int input_dims = input_shape.dims(); + input_dims, ", but got shapes ", begin_tensor_shape.DebugString(), + " and ", size_tensor_shape.DebugString(), " instead.")); std::vector begin; std::vector size; @@ -129,7 +127,15 @@ class SliceOp : public XlaOpKernel { input_shape.dim_size(i), "], but ", "got ", size[i])); } - ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), ctx->Input(1), size)); + + absl::InlinedVector scalar_indices; + scalar_indices.reserve(input_dims); + xla::XlaOp begin = ctx->Input("begin"); + for (int i = 0; i < input_dims; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(begin, {i}, {i + 1}, {1}), {})); + + ctx->SetOutput(0, 
xla::DynamicSlice(ctx->Input(0), scalar_indices, size)); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index fa5a96ca6bd..976ff91f6ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -136,8 +136,11 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); OP_REQUIRES( ctx, num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the number of elements.")); + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set the number " + "of elements. This could also happen if you're using a TensorArray " + "in a while loop that does not have its maximum_iterations set; you " + "can fix this by setting maximum_iterations to a suitable value.")); // If element shape is compile time constant and it's not "unknown rank" // shape (-1), create an initialized TensorList. Otherwise create an @@ -197,10 +200,13 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); - OP_REQUIRES( - ctx, max_num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the max number of elements.")); + OP_REQUIRES(ctx, max_num_elements >= 0, + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set " + "the max number of elements. This could also happen if " + "you're using a TensorArray in a while loop that does not " + "have its maximum_iterations set; you can fix this by " + "setting maximum_iterations to a suitable value.")); if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. @@ -431,6 +437,120 @@ class TensorListStackOp : public XlaOpKernel { REGISTER_XLA_OP(Name("TensorListStack"), TensorListStackOp); +class TensorListConcatOp : public XlaOpKernel { + public: + explicit TensorListConcatOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input = ctx->Input(0); + + // Check that the TensorList is initialized. + bool is_initialized; + OP_REQUIRES_OK(ctx, (IsTensorListInitialized(input, &is_initialized))); + OP_REQUIRES(ctx, is_initialized, + errors::InvalidArgument("TensorList is not initialized")); + + // Only non-nested TensorList is supported for now.
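+ // (A nested TensorList is one whose elements are themselves TensorLists;
+ // the check below rejects that case.) The concatenation itself is done by
+ // collapsing the two leading dimensions of the list buffer, i.e. by
+ // reshaping [num_elements, element_len, ...] into
+ // [num_elements * element_len, ...].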
+ bool is_nested; + OP_REQUIRES_OK(ctx, IsNestedTensorList(input, &is_nested)); + OP_REQUIRES(ctx, !is_nested, + errors::Unimplemented("Only non-nested TensorList is supported " + "for TensorListConcat.")); + + xla::XlaOp buffer; + OP_REQUIRES_OK(ctx, GetTensorListBuffer(input, &buffer)); + + xla::XlaBuilder* b = input.builder(); + auto shape_or = b->GetShape(buffer); + OP_REQUIRES_OK(ctx, shape_or.status()); + xla::Shape element_shape = shape_or.ConsumeValueOrDie(); + std::vector element_dims = + xla::SpanToVector(element_shape.dimensions()); + OP_REQUIRES( + ctx, element_dims.size() > 1, + errors::Unimplemented("TensorList of scalars is not supported")); + int64 num_elements = element_dims[0]; + int64 tensor_lengths = element_dims[1]; + + std::vector new_dims = {num_elements * tensor_lengths}; + + for (int i = 2; i < element_dims.size(); i++) { + new_dims.push_back(element_dims[i]); + } + + xla::XlaOp out = xla::Reshape(buffer, new_dims); + ctx->SetOutput(0, out); + + // Second output is a tensor of lengths of returned tensors. + xla::XlaOp lengths = xla::ConstantR1(b, num_elements, tensor_lengths); + ctx->SetOutput(1, lengths); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TensorListConcatOp); +}; + +REGISTER_XLA_OP(Name("TensorListConcatV2"), TensorListConcatOp); + +class TensorListSplitOp : public XlaOpKernel { + public: + explicit TensorListSplitOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_)); + // Only non-nested TensorList is supported for now. + OP_REQUIRES( + ctx, dtype_ != DT_VARIANT, + errors::Unimplemented( + "Only non-nested TensorList is supported for TensorListReserve.")); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input_tensor = ctx->Input(0); + + xla::XlaBuilder* b = input_tensor.builder(); + auto shape_or = b->GetShape(input_tensor); + OP_REQUIRES_OK(ctx, shape_or.status()); + xla::Shape element_shape = shape_or.ConsumeValueOrDie(); + std::vector element_dims = + xla::SpanToVector(element_shape.dimensions()); + OP_REQUIRES( + ctx, !element_dims.empty(), + errors::Unimplemented("Element dimensions have to be non-empty")); + + std::vector lengths; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &lengths)); + OP_REQUIRES(ctx, !lengths.empty(), + errors::Unimplemented("Length has to be non-empty")); + int64 length = lengths[0]; + for (int64 len : lengths) { + OP_REQUIRES(ctx, len == length, + errors::Unimplemented("All lengths have to be the same")); + } + OP_REQUIRES( + ctx, element_dims[0] % length == 0, + errors::Unimplemented("Buffer size has to be a multiple of length")); + std::vector new_dims = {element_dims[0] / length, length}; + for (int i = 1; i < element_dims.size(); i++) { + new_dims.push_back(element_dims[i]); + } + + xla::XlaOp reshaped = xla::Reshape(input_tensor, new_dims); + + xla::XlaOp result; + OP_REQUIRES_OK(ctx, ExecuteTensorListFromTensor(length, reshaped, &result)); + ctx->SetTensorListOutput(0, result); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorListSplitOp); +}; + +REGISTER_XLA_OP(Name("TensorListSplit") + .CompileTimeConstantInput("element_shape") + .CompileTimeConstantInput("lengths"), + TensorListSplitOp); + class TensorListFromTensorOp : public XlaOpKernel { public: explicit TensorListFromTensorOp(OpKernelConstruction* ctx) diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 499e27f0981..c398e5f129e 100644 --- 
a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -18,10 +18,18 @@ limitations under the License. #include #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +auto* mlir_bridge_gauge_v1 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v1", + "Tracks usage of the MLIR-based TF2XLA bridge among TF1 models"); +auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v2", + "Tracks usage of the MLIR-based TF2XLA bridge among TF2 models"); + // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator. // This builds on the model of XLA where a subset of the graph is encapsulated @@ -32,10 +40,12 @@ Status MlirBridgePass::Run(const ConfigProto& config_proto, mlir::ModuleOp module) { if (!config_proto.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge, session flag not enabled"; + mlir_bridge_gauge_v2->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge"; + mlir_bridge_gauge_v2->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1))); @@ -48,10 +58,12 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, if (!options.session_options->config.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge V1 Compat, session flag not enabled"; + mlir_bridge_gauge_v1->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge V1 Compat"; + mlir_bridge_gauge_v1->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridgeV1Compat(module, /*enable_logging=*/VLOG_IS_ON(1))); diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index daf261fa5d8..43793be56a7 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include #include +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -95,6 +96,7 @@ static void RegisterDialects() { mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); return true; }(); (void)init_once; diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 24afe595b18..7ea69f734c9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -99,5 +99,42 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } +TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { + GraphDef graph_def = SumGraph(); + tf2xla::Config config = SumConfig(); + NodeDef* unused = graph_def.add_node(); + unused->set_name("unused"); + unused->set_op("Placeholder"); + (*unused->mutable_attr())["dtype"] = TypeAttrValue(DT_INT32); + config.add_feed()->mutable_id()->set_node_name("unused"); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + // Set up arguments. + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); + auto x_global_or = client->TransferToServer(x_literal); + auto y_global_or = client->TransferToServer(y_literal); + auto unused_global_or = client->TransferToServer(y_literal); + TF_EXPECT_OK(x_global_or.status()); + TF_EXPECT_OK(y_global_or.status()); + TF_EXPECT_OK(unused_global_or.status()); + std::unique_ptr x_global = + std::move(x_global_or.ValueOrDie()); + std::unique_ptr y_global = + std::move(y_global_or.ValueOrDie()); + std::unique_ptr unused_global = + std::move(unused_global_or.ValueOrDie()); + + // Execute and check result. + auto result_or = client->ExecuteAndTransfer( + computation, {x_global.get(), y_global.get(), unused_global.get()}); + TF_EXPECT_OK(result_or.status()); + xla::Literal result = std::move(result_or.ValueOrDie()); + EXPECT_EQ("(\ns32[] 42\n)", result.ToString()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 3d6083621f4..1cf3e10b774 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/variant.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -571,6 +572,10 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); + bool is_inside_mustcompile = false; + TryGetNodeAttr(AttrSlice(&fbody->fdef.attr()), kXlaMustCompileAttr, + &is_inside_mustcompile); + // Performs a first function inlining pass before shape inference, since // otherwise shape inference can't see inside functions and a comprehensive // shape_map, including function ops, is needed to constant-propagate Shape @@ -622,6 +627,8 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.inline_multi_device_functions = true; graph_optimizer_options.inline_impl_selection_group_functions = true; graph_optimizer_options.inline_with_single_device_body_placer = true; + graph_optimizer_options.ignore_noinline = is_inside_mustcompile; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 1350f9e3e0b..45f49cee328 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -17,7 +17,6 @@ package_group( "//tensorflow/compiler/...", "//tensorflow/python/tpu/...", "//third_party/py/jax/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) @@ -332,6 +331,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index cd52e2f5e45..404f9eb7519 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -70,6 +70,12 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_num_partitions( return *this; } +ExecutableBuildOptions& ExecutableBuildOptions::set_use_spmd_partitioning( + bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + return *this; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_device_assignment( const DeviceAssignment& device_assignment) { device_assignment_ = device_assignment; diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 360ad0260df..9a7fdd974b1 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -77,6 +77,11 @@ class ExecutableBuildOptions { int num_partitions() const { return num_partitions_; } ExecutableBuildOptions& set_num_partitions(int num_partitions); + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. + bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + ExecutableBuildOptions& set_use_spmd_partitioning(bool use_spmd_partitioning); + // If set, this specifies a static device assignment for the computation. 
// Otherwise, the computation will be compiled generically and can be run with // any device assignment compatible with the computation's replica and @@ -104,6 +109,7 @@ class ExecutableBuildOptions { se::DeviceMemoryAllocator* device_allocator_ = nullptr; int num_replicas_ = 1; int num_partitions_ = 1; + bool use_spmd_partitioning_ = false; absl::optional device_assignment_; bool alias_passthrough_params_ = false; }; diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 9b8156efe5b..cb79b2ef7db 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -236,6 +236,19 @@ XLA_TEST_F(MathTest, SqrtF32) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(MathTest, SqrtF64) { + XlaBuilder builder(TestName()); + Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F64); + + std::unique_ptr zero_data = + client_->TransferToServer(zero_literal).ConsumeValueOrDie(); + + XlaOp zero = Parameter(&builder, 0, zero_literal.shape(), "zero"); + Sqrt(zero); + + ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); +} + #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64 XLA_TEST_F(MathTest, ErfInvF64) { XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index a779086f1d5..58365c0f498 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -860,34 +860,10 @@ XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64 start_index, }); } -XlaOp XlaBuilder::DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferDynamicSliceShape( - *operand_shape, {*start_indices_shape}, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, - {operand, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); std::vector start_indices_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes, @@ -898,43 +874,28 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicSliceShape( *operand_shape, start_indices_shapes, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - std::vector operands = {operand}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); + return DynamicSliceInternal(shape, operand, start_indices, slice_sizes); }); } -XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; +StatusOr XlaBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span 
start_indices, + absl::Span slice_sizes) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferDynamicUpdateSliceShape( - *operand_shape, *update_shape, {*start_indices_shape})); - *instr.mutable_shape() = shape.ToProto(); + for (int64 size : slice_sizes) { + instr.add_dynamic_slice_sizes(size); + } - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - {operand, update, start_indices}); - }); + std::vector operands = {operand}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); } XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); std::vector start_indices_shape_ptrs; @@ -946,15 +907,22 @@ XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferDynamicUpdateSliceShape( *operand_shape, *update_shape, start_indices_shapes)); - *instr.mutable_shape() = shape.ToProto(); - - std::vector operands = {operand, update}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - operands); + return DynamicUpdateSliceInternal(shape, operand, update, start_indices); }); } +StatusOr XlaBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + std::vector operands = {operand, update}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, + operands); +} + XlaOp XlaBuilder::ConcatInDim(absl::Span operands, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -1301,7 +1269,6 @@ XlaOp XlaBuilder::ConvGeneralDilated( int64 feature_group_count, int64 batch_group_count, const PrecisionConfig* precision_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); TF_RETURN_IF_ERROR( @@ -1314,30 +1281,45 @@ XlaOp XlaBuilder::ConvGeneralDilated( window_dimensions[i] = rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } - TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + + TF_ASSIGN_OR_RETURN(Window window, ShapeInference::InferWindowFromDimensions( window_dimensions, window_strides, padding, lhs_dilation, rhs_dilation)); - - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferConvolveShape( - *lhs_shape, *rhs_shape, feature_group_count, - batch_group_count, instr.window(), dimension_numbers)); - *instr.mutable_shape() = shape.ToProto(); - - *instr.mutable_convolution_dimension_numbers() = dimension_numbers; - instr.set_feature_group_count(feature_group_count); - instr.set_batch_group_count(batch_group_count); - - if (precision_config != nullptr) { 
- *instr.mutable_precision_config() = *precision_config; - } - - return AddInstruction(std::move(instr), HloOpcode::kConvolution, - {lhs, rhs}); + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeInference::InferConvolveShape( + *lhs_shape, *rhs_shape, feature_group_count, + batch_group_count, window, dimension_numbers)); + return ConvGeneralDilatedInternal(shape, lhs, rhs, window, window_strides, + padding, lhs_dilation, rhs_dilation, + dimension_numbers, feature_group_count, + batch_group_count, precision_config); }); } +StatusOr XlaBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + *instr.mutable_window() = window; + *instr.mutable_convolution_dimension_numbers() = dimension_numbers; + instr.set_feature_group_count(feature_group_count); + instr.set_batch_group_count(batch_group_count); + + if (precision_config != nullptr) { + *instr.mutable_precision_config() = *precision_config; + } + + return AddInstruction(std::move(instr), HloOpcode::kConvolution, {lhs, rhs}); +} + XlaOp XlaBuilder::Fft(XlaOp operand, const FftType fft_type, const absl::Span fft_length) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -2203,6 +2185,39 @@ XlaOp XlaBuilder::BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean, }); } +XlaOp XlaBuilder::AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, + ShapeInference::InferAllGatherShape( + *operand_shape, all_gather_dimension, shard_count)); + if (layout) { + *inferred_shape.mutable_layout() = *layout; + instr.set_constrain_layout(true); + } + *instr.mutable_shape() = inferred_shape.ToProto(); + + instr.add_dimensions(all_gather_dimension); + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + if (channel_id.has_value()) { + instr.set_channel_id(channel_id->handle()); + } + + TF_ASSIGN_OR_RETURN( + auto all_gather, + AddInstruction(std::move(instr), HloOpcode::kAllGather, {operand})); + return all_gather; + }); +} + XlaOp XlaBuilder::CrossReplicaSum( XlaOp operand, absl::Span replica_groups) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -3105,20 +3120,11 @@ XlaOp SliceInDim(const XlaOp operand, int64 start_index, int64 limit_index, stride, dimno); } -XlaOp DynamicSlice(const XlaOp operand, const XlaOp start_indices, - absl::Span slice_sizes) { - return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); -} XlaOp DynamicSlice(const XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); } -XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, - const XlaOp start_indices) { - return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); -} - XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, absl::Span start_indices) { return operand.builder()->DynamicUpdateSlice(operand, update, 
start_indices); @@ -3182,6 +3188,10 @@ XlaOp Compare(const XlaOp lhs, const XlaOp rhs, broadcast_dimensions, direction); } +XlaOp Compare(const XlaOp lhs, const XlaOp rhs, ComparisonDirection direction) { + return Compare(lhs, rhs, {}, direction); +} + XlaOp Dot(const XlaOp lhs, const XlaOp rhs, const PrecisionConfig* precision_config) { return lhs.builder()->Dot(lhs, rhs, precision_config); @@ -3470,6 +3480,16 @@ XlaOp ReduceWindowWithGeneralPadding( base_dilations, window_dilations, padding); } +XlaOp AllGather(const XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return operand.builder()->AllGather(operand, all_gather_dimension, + shard_count, replica_groups, channel_id, + layout); +} + XlaOp CrossReplicaSum(const XlaOp operand, absl::Span replica_groups) { return operand.builder()->CrossReplicaSum(operand, replica_groups); diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 2ab4c575862..426b6d83207 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -421,16 +421,17 @@ class XlaBuilder { virtual XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); + virtual StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); + virtual StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices); XlaOp ConcatInDim(absl::Span operands, int64 dimension); virtual StatusOr ConcatInDimInternal(const Shape& shape, @@ -491,6 +492,16 @@ class XlaBuilder { int64 batch_group_count = 1, const PrecisionConfig* precision_config = nullptr); + virtual StatusOr ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config); + XlaOp Fft(XlaOp operand, FftType fft_type, absl::Span fft_length); @@ -549,6 +560,12 @@ class XlaBuilder { XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); + XlaOp AllGather( + XlaOp operand, int64 all_gather_dimension, int64 shard_count, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& layout = absl::nullopt); + XlaOp AllReduce( XlaOp operand, const XlaComputation& computation, absl::Span replica_groups = {}, @@ -842,14 +859,10 @@ class XlaBuilder { friend XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - friend XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); friend XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp 
start_indices); friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -876,6 +889,7 @@ class XlaBuilder { friend XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); + friend XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction); friend XlaOp Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config); friend XlaOp DotGeneral(XlaOp lhs, XlaOp rhs, @@ -992,6 +1006,11 @@ class XlaBuilder { absl::Span> padding); friend XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups); + friend XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout); friend XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, const absl::optional& channel_id, @@ -1417,10 +1436,6 @@ XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. // The shape of 'update' determines the shape of the slice of 'operand' @@ -1441,9 +1456,6 @@ XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must // have >= 1 entry. XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, @@ -1487,10 +1499,12 @@ XlaOp Lt(XlaOp lhs, XlaOp rhs, XlaOp Le(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); -// Enqueues a comparison instruction onto the computation. +// Enqueues a comparison instruction onto the computation (optionally without +// broadcast_dimensions for consistency with others). XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); +XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction); // Enqueues a dot instruction onto the computation. XlaOp Dot(XlaOp lhs, XlaOp rhs, @@ -1771,6 +1785,11 @@ XlaOp ReduceWindowWithGeneralPadding( XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); +XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, int64 shard_count, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& layout = absl::nullopt); + // Enqueues an operation that do an AllReduce of the operand cross cores. Here // AllReduce means doing a reduction on the input operand cross cores and then // broadcasting the reduction result to those cores. 
The reduction function is diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 1fa839b2014..4fa47077fca 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -381,6 +381,29 @@ TEST_F(XlaBuilderTest, Transpose) { EXPECT_THAT(root, op::Transpose(op::Parameter())); } +TEST_F(XlaBuilderTest, AllGatherR1) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4}), "x"); + AllGather(x, /*all_gather_dimension=*/0, /*shard_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + EXPECT_EQ(root->opcode(), HloOpcode::kAllGather); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {16}))); +} + +TEST_F(XlaBuilderTest, AllGatherR2) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); + AllGather(x, /*all_gather_dimension=*/1, /*shard_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + EXPECT_EQ(root->opcode(), HloOpcode::kAllGather); + EXPECT_TRUE( + ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {4, 64}))); +} + TEST_F(XlaBuilderTest, AllToAll) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h index 3ccbfb28bd0..6a3b17a154a 100644 --- a/tensorflow/compiler/xla/client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -29,8 +29,8 @@ namespace xla { class XlaComputation { public: XlaComputation() : unique_id_(-1) {} - XlaComputation(const HloModuleProto& proto) - : unique_id_(proto.id()), proto_(proto) {} + XlaComputation(HloModuleProto proto) + : unique_id_(proto.id()), proto_(std::move(proto)) {} ~XlaComputation() {} diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 216fb0a7422..958629c5fa6 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -55,18 +55,23 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { // b/77879207. opts.set_xla_gpu_disable_multi_streaming(true); - // TODO(jlebar): Disable fastmath once doing so is not a performance - // regression. + // Disable forms of fast math that have caused users problems in the past. opts.set_xla_cpu_enable_fast_math(true); + opts.set_xla_cpu_fast_math_honor_nans(true); + opts.set_xla_cpu_fast_math_honor_infs(true); + opts.set_xla_cpu_fast_math_honor_functions(true); + opts.set_xla_cpu_fast_math_honor_division(true); + + // By default, copy TF's Eigen style min_max behavior with nans. + opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_gpu_enable_fast_min_max(true); opts.set_xla_allow_excess_precision(true); opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_deterministic_reductions(false); opts.set_xla_cpu_enable_xprof_traceme(true); - // TODO(b/155295372): disable ptxas fallback by default. 
- opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); - opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(true); + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false); return opts; } @@ -261,6 +266,12 @@ static void AllocateFlags() { "When xla_cpu_enable_fast_math is true then this controls whether we " "forbid to approximate calculations for functions. Ignored when " "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_min_max), + flag_values->xla_cpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that always propagates " + "NaNs.")); flag_objects->push_back(tensorflow::Flag( "xla_gpu_enable_fast_min_max", bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), @@ -554,15 +565,6 @@ static void AllocateFlags() { "that falling back to the driver can have drawbacks like using more " "memory and/or other bugs during compilation, so we recommend setting " "this flag to false.")); - flag_objects->push_back(tensorflow::Flag( - "xla_gpu_unsafe_fallback_to_driver_on_ptxas_error", - bool_setter_for( - &DebugOptions::set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error), - flag_values->xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(), - "If true, XLA GPU falls back to the driver if there is an error when " - "running ptxas. Note that falling back to the driver can have drawbacks " - "like using more memory and/or other bugs during compilation, so we " - "recommend setting this flag to false.")); ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects); } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 43ee0fdd820..8ae8c418d5d 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -50,6 +50,7 @@ class RunId { public: // Creates a new, unique RunId. RunId(); + explicit RunId(int64 value) : data_(value) {} RunId(const RunId&) = default; RunId& operator=(const RunId&) = default; diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index b7868fedb8b..60bde306266 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -174,9 +174,33 @@ When filing bugs, attach the contents of the `/tmp/generated` directory If possible, try to isolate a bug to a single XLA program by using the -[`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/replay_computation.cc) +[`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/run_hlo_module_main.cc) and iteratively running it on generated programs. +## Known Issues + +Compilation with XLA can greatly improve the performance of your programs, but +the TensorFlow interop has a number of known sharp corners. + +### TensorArray TF/XLA Interconversion + +The problem manifests itself as an error message +`Support for TensorList crossing the XLA/TF boundary is not implemented`. + +XLA supports `tf.TensorArray`. However, the _interconversion_ between TF and +XLA representations is not implemented yet. +This error often arises when the `TensorArray` is used inside the compiled +block, but the derivative is taken outside. + +Workaround: compile the outermost scope which is taking the derivative. 
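+
+For example, here is a minimal sketch of this workaround (assuming the
+`tf.function(experimental_compile=True)` entry point): both the `TensorArray`
+and the gradient stay inside the compiled function, so the list never has to
+cross the XLA/TF boundary.
+
+```python
+import tensorflow as tf
+
+@tf.function(experimental_compile=True)  # compile the outermost scope
+def train_step(x):
+  with tf.GradientTape() as tape:
+    tape.watch(x)
+    ta = tf.TensorArray(tf.float32, size=3)
+    for i in tf.range(3):
+      ta = ta.write(i, x * tf.cast(i, tf.float32))
+    y = tf.reduce_sum(ta.stack())
+  # The derivative is taken inside the compiled scope.
+  return tape.gradient(y, x)
+
+print(train_step(tf.constant(2.0)))  # tf.Tensor(3.0, shape=(), dtype=float32)
+```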
+ +### Random Number Generation + +XLA currently ignores TF seeds to random operations. This affects stateful TF +random operations, such as `tf.random.normal`, or `tf.nn.dropout`. XLA will +behave as if the compilation was seeded with a new unique seed at each run. This +limitation does not apply to stateless random ops. + ## XLA Frontends Apart from TensorFlow, XLA programs can be generated by: diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 495701eaac2..002d07184a7 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2299,20 +2299,26 @@ The output is guaranteed to be a deterministic function of the initial state but it is *not* guaranteed to be deterministic between backends and different compiler versions. -`RngBitGenerator(algorithm, key, shape)` | Arguments | Type | Semantics | -|---------------- | ----------------- | ------------------------------------- | -| `algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. | | -`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. | | `shape` | -`Shape` | Output shape for generated data. | +`RngBitGenerator(algorithm, key, shape)` -Available values for `algorithm`: * `rng_default`: Backend specific algorithm -with backend specific shape requirements. * `rng_three_fry`: ThreeFry -counter-based PRNG algorithm. The `initial_state` shape is `u64[2]` with -arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) -* `rng_philox`: Philox algorithm to generate random numbers in parallel. The -`initial_state` shape is `u64[3]` with arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) +Arguments | Type | Semantics +--------------- | ----------------- | ------------------------------------- +`algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. +`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. +`shape` | `Shape` | Output shape for generated data. + +Available values for `algorithm`: + +- `rng_default`: Backend specific algorithm with backend specific shape + requirements. + +- `rng_three_fry`: ThreeFry counter-based PRNG algorithm. The `initial_state` + shape is `u64[2]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + +- `rng_philox`: Philox algorithm to generate random numbers in parallel. The + `initial_state` shape is `u64[3]` with arbitrary values. + [Salmon et al. SC 2011. 
Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) ## Scatter diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index cbbad741ce3..73c37d6b2f3 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -2104,6 +2104,32 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr, root_piece_->set_subshape(shape_.get()); } +MutableBorrowingLiteral::MutableBorrowingLiteral(absl::Span src_buf_ptrs, + const Shape& shape) + : MutableLiteralBase() { + shape_ = absl::make_unique(shape); + if (!shape_->IsTuple()) { + CHECK_EQ(src_buf_ptrs.size(), 1); + root_piece_ = new Piece(); + root_piece_->set_buffer(const_cast(src_buf_ptrs[0])); + root_piece_->set_subshape(shape_.get()); + } else { + CHECK(!ShapeUtil::IsNestedTuple(*shape_)); + CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_)); + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + for (int i = 0; i < src_buf_ptrs.size(); ++i) { + Piece child_piece; + const auto& src_shape = shape_->tuple_shapes(i); + CHECK(src_shape.IsArray()); + child_piece.set_subshape(&src_shape); + child_piece.set_buffer(src_buf_ptrs[i]); + root_piece_->emplace_back(std::move(child_piece)); + } + } +} + MutableBorrowingLiteral::~MutableBorrowingLiteral() { if (root_piece_ != nullptr) { delete root_piece_; diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index 1553d042e80..a2be92fbf5b 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -776,6 +776,10 @@ class MutableBorrowingLiteral : public MutableLiteralBase { const ShapeIndex& view_root); MutableBorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + // Create a literal from a list of buffers and a shape. + // Returns a tuple literal if `shape` is a tuple type. + MutableBorrowingLiteral(absl::Span src_buf_ptrs, const Shape& shape); + private: // Recursively copies the subtree from the `src_piece` at the given child // index to the `dest_piece`. 
For buffers only the pointers are copied, but diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 8c6bc84cf8e..10737489331 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,5 +1,6 @@ load("//tensorflow/core/platform:build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "pybind_extension") @@ -186,6 +187,89 @@ cc_library( ], ) +cc_library( + name = "ops", + srcs = ["ops.cc"], + hdrs = ["ops.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + ":types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:qr", + "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", + "//tensorflow/compiler/xla/client/lib:sorting", + "//tensorflow/compiler/xla/client/lib:svd", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + "@pybind11", + ], +) + +cc_library( + name = "outfeed_receiver", + srcs = ["outfeed_receiver.cc"], + hdrs = ["outfeed_receiver.h"], + deps = [ + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + ], +) + +tf_cc_test( + name = "cpu_outfeed_receiver_test", + size = "small", + srcs = ["outfeed_receiver_test.cc"], + deps = [ + ":outfeed_receiver", + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/pjrt:cpu_device", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "outfeed_receiver_py", + srcs = ["outfeed_receiver_py.cc"], + hdrs = ["outfeed_receiver_py.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + ":outfeed_receiver", + ":types", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "@pybind11", + ], +) + config_setting( name = "enable_gpu", values = {"define": "xla_python_enable_gpu=true"}, @@ -205,7 +289,9 @@ pybind_extension( deps = [ ":bfloat16", ":dlpack", + ":ops", ":python_ref_manager", + ":outfeed_receiver_py", ":types", "@com_google_absl//absl/base", "@com_google_absl//absl/hash", @@ -228,12 +314,6 @@ pybind_extension( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/client/lib:comparators", - "//tensorflow/compiler/xla/client/lib:math", - 
"//tensorflow/compiler/xla/client/lib:qr", - "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", - "//tensorflow/compiler/xla/client/lib:sorting", - "//tensorflow/compiler/xla/client/lib:svd", "//tensorflow/compiler/xla/pjrt:cpu_device", "//tensorflow/compiler/xla/pjrt:nvidia_gpu_device", "//tensorflow/compiler/xla/pjrt:pjrt_client", @@ -260,8 +340,8 @@ pybind_extension( "//tensorflow/core:lib_internal_impl", # buildcleaner: keep "//tensorflow/core/profiler/lib:profiler_backends", "//tensorflow/core/profiler/lib:profiler_session", - "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/rpc:profiler_server", + "//tensorflow/python/profiler/internal:traceme_wrapper", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:platform", ] + select({ diff --git a/tensorflow/compiler/xla/python/ops.cc b/tensorflow/compiler/xla/python/ops.cc new file mode 100644 index 00000000000..89891d39f78 --- /dev/null +++ b/tensorflow/compiler/xla/python/ops.cc @@ -0,0 +1,356 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/ops.h" + +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "pybind11/attr.h" +#include "pybind11/pybind11.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/qr.h" +#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/lib/svd.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/python/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +namespace py = pybind11; + +void BuildOpsSubmodule(py::module* m) { + // ops submodule, containing free functions that add operators to an + // XlaBuilder. 
+ py::module ops = m->def_submodule("ops", "XLA operations"); + + py::enum_( + ops, "TriangularSolveOptions_Transpose") + .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) + .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) + .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) + .value("ADJOINT", TriangularSolveOptions::ADJOINT); + + ops.def("AfterAll", &AfterAll, py::arg("builder"), py::arg("tokens")); + ops.def( + "AllReduce", + static_cast, + const absl::optional&, const absl::optional&)>( + &AllReduce), + py::arg("operand"), py::arg("computation"), + py::arg("replica_groups") = py::list(), + py::arg("channel_id") = absl::nullopt, + py::arg("shape_with_layout") = absl::nullopt); + ops.def("AllToAll", &AllToAll, py::arg("operand"), py::arg("split_dimension"), + py::arg("concat_dimension"), py::arg("split_count"), + py::arg("replica_groups") = py::list(), + py::arg("layout") = absl::nullopt); + ops.def("CollectivePermute", &CollectivePermute, py::arg("operand"), + py::arg("source_target_pairs")); + ops.def("CreateToken", &CreateToken, py::arg("builder")); + ops.def("CrossReplicaSum", + static_cast)>( + &CrossReplicaSum), + py::arg("operand"), py::arg("replica_groups") = py::list()); + ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"), + py::arg("new_element_type")); + ops.def("Broadcast", &Broadcast, py::arg("operand"), py::arg("sizes")); + ops.def("BroadcastInDim", &BroadcastInDim, py::arg("operand"), + py::arg("shape"), py::arg("broadcast_dimensions")); + ops.def("Call", &Call, py::arg("builder"), py::arg("computation"), + py::arg("operands")); + ops.def("Cholesky", &Cholesky, py::arg("a"), py::arg("lower") = true); + ops.def("Clamp", &Clamp, py::arg("min"), py::arg("operand"), py::arg("max")); + ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions")); + ops.def("ConcatInDim", &ConcatInDim, py::arg("builder"), py::arg("operands"), + py::arg("dimension")); + ops.def("Conditional", + static_cast, + absl::Span)>(&Conditional), + py::arg("branch_index"), py::arg("branch_computations"), + py::arg("branch_operands")); + ops.def("Conditional", + static_cast(&Conditional), + py::arg("predicate"), py::arg("true_operand"), + py::arg("true_computation"), py::arg("false_operand"), + py::arg("false_computation")); + ops.def("Constant", &ConstantLiteral, py::arg("builder"), py::arg("literal")); + ops.def("ConstantLiteral", &ConstantLiteral, py::arg("builder"), + py::arg("literal")); + ops.def("ConvGeneralDilated", &ConvGeneralDilated, py::arg("lhs"), + py::arg("rhs"), py::arg("window_strides"), py::arg("padding"), + py::arg("lhs_dilation"), py::arg("rhs_dilation"), + py::arg("dimension_numbers"), py::arg("feature_group_count") = 1, + py::arg("batch_group_count") = 1, + py::arg("precision_config") = nullptr); + ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), + py::arg("new_element_type")); + ops.def( + "CustomCall", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape, + const py::bytes& opaque) -> XlaOp { + return CustomCall(builder, call_target_name, operands, shape, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape"), py::arg("opaque") = py::bytes("")); + ops.def( + "CustomCallWithLayout", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape_with_layout, + absl::Span operand_shapes_with_layout, + const py::bytes& opaque) -> XlaOp { + return 
CustomCallWithLayout(builder, call_target_name, operands, + shape_with_layout, + operand_shapes_with_layout, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), + py::arg("opaque") = py::bytes("")); + ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), + py::arg("precision_config") = nullptr); + ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), + py::arg("dimension_numbers"), py::arg("precision_config") = nullptr); + ops.def("DynamicSlice", + static_cast, + absl::Span)>(&DynamicSlice), + py::arg("operand"), py::arg("start_indices"), py::arg("slice_sizes")); + ops.def("DynamicUpdateSlice", + static_cast)>( + &DynamicUpdateSlice), + py::arg("operand"), py::arg("update"), py::arg("start_indices")); + + ops.def("Fft", &Fft, py::arg("operand"), py::arg("fft_type"), + py::arg("fft_length")); + + ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), + py::arg("dimension_numbers"), py::arg("slice_sizes"), + py::arg("indices_are_sorted") = false); + ops.def("GetTupleElement", &GetTupleElement, py::arg("tuple_data"), + py::arg("index")); + ops.def("InfeedWithToken", &InfeedWithToken, py::arg("token"), + py::arg("shape"), py::arg("config") = ""); + ops.def("Iota", + static_cast(&Iota), + py::arg("builder"), py::arg("shape"), py::arg("iota_dimension")); + ops.def("Iota", + static_cast(&Iota), + py::arg("builder"), py::arg("type"), py::arg("size")); + ops.def("Map", &Map, py::arg("builder"), py::arg("operands"), + py::arg("computation"), py::arg("dimensions"), + py::arg("static_operands") = py::list()); + ops.def("NextAfter", &NextAfter, py::arg("from"), py::arg("to")); + ops.def("OutfeedWithToken", &OutfeedWithToken, py::arg("operand"), + py::arg("token"), py::arg("shape_with_layout"), + py::arg("outfeed_config") = ""); + ops.def("Pad", &Pad, py::arg("operand"), py::arg("padding_value"), + py::arg("padding_config")); + ops.def("Parameter", + static_cast&)>( + &Parameter), + py::arg("builder"), py::arg("parameter_number"), py::arg("shape"), + py::arg("name") = "", + py::arg("replicated_at_leaf_buffers") = std::vector()); + ops.def( + "QR", + [](XlaOp a, bool full_matrices) -> StatusOr> { + TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); + return std::make_pair(qr.q, qr.r); + }, + py::arg("operand"), py::arg("full_matrices")); + ops.def( + "Eigh", + [](XlaOp a, bool lower, int64 max_iter, + float epsilon) -> std::pair { + auto eigh = SelfAdjointEig(a, lower, max_iter, epsilon); + return std::make_pair(eigh.v, eigh.w); + }, + py::arg("a"), py::arg("lower") = true, py::arg("max_iter") = 100, + py::arg("epsilon") = 1e-6); + ops.def( + "SVD", + [](XlaOp a, int64 max_iter, + float epsilon) -> std::tuple { + auto svd = SVD(a, max_iter, epsilon); + return std::make_tuple(svd.u, svd.d, svd.v); + }, + py::arg("a"), py::arg("max_iter") = 100, py::arg("epsilon") = 1e-6); + ops.def("Reduce", + static_cast, + absl::Span, const XlaComputation&, + absl::Span)>(&Reduce), + py::arg("builder"), py::arg("operands"), py::arg("init_values"), + py::arg("computation"), py::arg("dimensions_to_reduce")); + ops.def("ReducePrecision", &ReducePrecision, py::arg("operand"), + py::arg("exponent_bits"), py::arg("mantissa_bits")); + ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding, + py::arg("operand"), py::arg("init_value"), py::arg("computation"), + py::arg("window_dimensions"), py::arg("window_strides"), + py::arg("base_dilations"), 
py::arg("window_dilations"), + py::arg("padding")); + ops.def("ReplicaId", &ReplicaId, py::arg("builder")); + ops.def("Reshape", + static_cast, + absl::Span)>(&Reshape), + py::arg("operand"), py::arg("dimensions"), py::arg("new_sizes")); + ops.def("Reshape", + static_cast)>(&Reshape), + py::arg("operand"), py::arg("new_sizes")); + ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions")); + ops.def("RngNormal", &RngNormal, py::arg("mu"), py::arg("sigma"), + py::arg("shape")); + ops.def("RngUniform", &RngUniform, py::arg("a"), py::arg("b"), + py::arg("shape")); + ops.def("Scatter", &Scatter, py::arg("input"), py::arg("scatter_indices"), + py::arg("updates"), py::arg("update_computation"), + py::arg("dimension_numbers"), py::arg("indices_are_sorted") = false, + py::arg("unique_indices") = false); + ops.def("Select", &Select, py::arg("pred"), py::arg("on_true"), + py::arg("on_false")); + ops.def("SelectAndScatterWithGeneralPadding", + &SelectAndScatterWithGeneralPadding, py::arg("operand"), + py::arg("select"), py::arg("window_dimensions"), + py::arg("window_strides"), py::arg("padding"), py::arg("source"), + py::arg("init_value"), py::arg("scatter")); + ops.def("Slice", &Slice, py::arg("operand"), py::arg("start_indices"), + py::arg("limit_indices"), py::arg("strides")); + ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"), + py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); + ops.def( + "Sort", + [](XlaBuilder* builder, absl::Span operands, + absl::optional comparator, int64 dimension, + bool is_stable) -> XlaOp { + return builder->ReportErrorOrReturn([&]() -> StatusOr { + std::vector operand_types; + for (const auto& operand : operands) { + TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand)); + operand_types.push_back(operand_shape.element_type()); + } + + if (comparator) { + return Sort(operands, **comparator, dimension, is_stable); + } else { + return Sort(operands, + CreateScalarLtComputation(operand_types, builder), + dimension, is_stable); + } + }); + }, + py::arg("builder"), py::arg("operands"), + py::arg("comparator") = absl::nullopt, py::arg("dimension") = -1, + py::arg("is_stable") = false); + ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); + ops.def("Transpose", &Transpose, py::arg("operand"), py::arg("permutation")); + ops.def("TriangularSolve", &TriangularSolve, py::arg("a"), py::arg("b"), + py::arg("left_side"), py::arg("lower"), py::arg("unit_diagonal"), + py::arg("transpose_a")); + ops.def("Tuple", &Tuple, py::arg("builder"), py::arg("elements")); + ops.def("While", &While, py::arg("condition"), py::arg("body"), + py::arg("init")); + + ops.def("Igamma", &Igamma, py::arg("a"), py::arg("x")); + ops.def("Igammac", &Igammac, py::arg("a"), py::arg("x")); + ops.def("IgammaGradA", &IgammaGradA, py::arg("a"), py::arg("x")); + ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); + ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), + py::arg("b"), py::arg("x")); + +#define BINARY_OP(op) \ + ops.def( \ + #op, \ + [](XlaOp a, XlaOp b, absl::optional> dims) { \ + return dims ? 
op(a, b, *dims) : op(a, b); \ + }, \ + py::arg("lhs"), py::arg("rhs"), \ + py::arg("broadcast_dimensions") = absl::nullopt) + BINARY_OP(Eq); + BINARY_OP(Ne); + BINARY_OP(Ge); + BINARY_OP(Gt); + BINARY_OP(Lt); + BINARY_OP(Le); + BINARY_OP(Add); + BINARY_OP(Sub); + BINARY_OP(Mul); + BINARY_OP(Div); + BINARY_OP(Rem); + BINARY_OP(Max); + BINARY_OP(Min); + BINARY_OP(And); + BINARY_OP(Or); + BINARY_OP(Xor); + BINARY_OP(ShiftLeft); + BINARY_OP(ShiftRightArithmetic); + BINARY_OP(ShiftRightLogical); + BINARY_OP(Atan2); + BINARY_OP(Pow); + BINARY_OP(Complex); +#undef BINARY_OP + +#define UNARY_OP(op) ops.def(#op, &op) + UNARY_OP(Not); + UNARY_OP(PopulationCount); + UNARY_OP(Clz); + UNARY_OP(Abs); + UNARY_OP(Exp); + UNARY_OP(Expm1); + UNARY_OP(Floor); + UNARY_OP(Ceil); + UNARY_OP(Round); + UNARY_OP(Log); + UNARY_OP(Log1p); + UNARY_OP(Sign); + UNARY_OP(Cos); + UNARY_OP(Sin); + UNARY_OP(Tanh); + UNARY_OP(IsFinite); + UNARY_OP(Neg); + UNARY_OP(Sqrt); + UNARY_OP(Rsqrt); + UNARY_OP(Square); + UNARY_OP(Reciprocal); + UNARY_OP(Erfc); + UNARY_OP(Erf); + UNARY_OP(ErfInv); + UNARY_OP(Lgamma); + UNARY_OP(Digamma); + UNARY_OP(BesselI0e); + UNARY_OP(BesselI1e); + UNARY_OP(Acos); + UNARY_OP(Asin); + UNARY_OP(Atan); + UNARY_OP(Tan); + UNARY_OP(Acosh); + UNARY_OP(Asinh); + UNARY_OP(Atanh); + UNARY_OP(Cosh); + UNARY_OP(Sinh); + UNARY_OP(Real); + UNARY_OP(Imag); + UNARY_OP(Conj); +#undef UNARY_OP +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/ops.h b/tensorflow/compiler/xla/python/ops.h new file mode 100644 index 00000000000..7fe34e941ba --- /dev/null +++ b/tensorflow/compiler/xla/python/ops.h @@ -0,0 +1,27 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ + +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildOpsSubmodule(pybind11::module* m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.cc b/tensorflow/compiler/xla/python/outfeed_receiver.cc new file mode 100644 index 00000000000..0be4167c397 --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver.cc @@ -0,0 +1,492 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/outfeed_receiver.h"
+
+#include
+
+#include
+#include
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+
+// Implementation notes:
+//
+// Startup:
+// -------
+//
+// The startup is initiated by a call from Python to StartOutfeedReceiver,
+// which starts N threads for listening to the N devices and for enqueueing
+// the received data into a callback queue. There is one additional callback
+// thread for dequeuing the data and invoking the Python callback.
+//
+// Framing protocol
+// ----------------
+//
+// The outfeed mechanism has a single channel and the receiver must know
+// exactly the shape and number of outfeed operations issued by the compiled
+// code. This makes it hard to use outfeed in conditionals and loops and
+// especially when outfeeding different-shaped data.
+//
+// To address this, when we compile the code we capture the shape of the
+// data being outfed, and we generate a consumer ID (uint32_t) that is unique
+// across the lifetime of the program to: the Python callable to call back to,
+// the shape of the arguments, the keyword arguments to pass to the callable.
+// Each outfeed payload is preceded by a header (of shape u32[2]) with a
+// special first value and the consumer ID. We maintain a registry of shapes
+// by consumer ID. When receiving, we look up the shape by consumer ID, and
+// then we read the payload.
+//
+// Back pressure:
+// --------------
+//
+// We maintain a sum of the bytes from all the data waiting in the callback
+// queue. The listening threads will wait for the sum to drop below a
+// configurable threshold, default 256 MB. While a listening thread is waiting,
+// on CPU and GPU the next outfeed operation from the device will block. On
+// TPU there is a buffer, but eventually the TPU will also block.
+//
+// Shutdown:
+// ---------
+//
+// The shutdown is initiated automatically when the last reference to the
+// outfeed receiver object is dropped, and the Python garbage collector invokes
+// the destructor.
+//
+// The shutdown sequence is implemented as follows:
+// * we enqueue on all devices a computation that outfeeds a special header
+//   with consumer ID kOutfeedCidShutdown.
+// * when each listening thread gets the shutdown header, it decrements
+//   a counter of listening threads, and if the counter reaches 0, it
+//   enqueues a special shutdown callback.
+// * when the callback thread gets the shutdown callback marker, it terminates.
+// * the shutdown code waits until all threads terminate.
+//
+// Since we currently keep the shape registry in the OutfeedReceiver, it is
+// not safe to replace the OutfeedReceiver instance during the lifetime of
+// the JAX program, or else previously cached jitted computations may refer
+// to previously cached shapes. This can be solved, but for now we disallow
+// replacing the OutfeedReceiver, and do not provide a Shutdown API to the
+// Python program.
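To make the framing protocol above concrete, here is a small, self-contained C++ sketch of the decoding loop it implies. It is illustrative only and not part of the patch: the device outfeed and the Shape registry are replaced by an in-memory word stream and a payload-length map, and all names outside kOutfeedHeaderStart/kOutfeedCidShutdown (which it mirrors) are hypothetical.

// Toy decoder for the header-plus-payload framing described above.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

namespace outfeed_sketch {
constexpr uint32_t kHeaderStart = 271828;  // mirrors kOutfeedHeaderStart
constexpr uint32_t kCidShutdown = 0;       // mirrors kOutfeedCidShutdown

// Stand-in for the shape registry: consumer ID -> payload length in words.
using Registry = std::unordered_map<uint32_t, size_t>;

// Consumes frames from `stream` until the shutdown header is seen.
void DecodeStream(const std::vector<uint32_t>& stream, const Registry& registry) {
  size_t pos = 0;
  while (pos + 2 <= stream.size()) {
    uint32_t start = stream[pos++];        // header word 0: framing marker
    uint32_t consumer_id = stream[pos++];  // header word 1: consumer ID
    if (start != kHeaderStart) {
      std::cerr << "Corrupt frame at word " << pos - 2 << "\n";
      return;
    }
    if (consumer_id == kCidShutdown) return;           // shutdown frame has no payload
    size_t payload_words = registry.at(consumer_id);   // look up the registered "shape"
    std::vector<uint32_t> payload(stream.begin() + pos,
                                  stream.begin() + pos + payload_words);
    pos += payload_words;
    std::cout << "consumer " << consumer_id << ": " << payload.size()
              << " words\n";
  }
}
}  // namespace outfeed_sketch

int main() {
  // One 3-word payload for consumer 5, followed by a shutdown frame.
  outfeed_sketch::DecodeStream({271828, 5, 10, 20, 30, 271828, 0}, {{5, 3}});
  return 0;
}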
+
+namespace xla {
+
+// The header contains:
+// 0. kOutfeedHeaderStart
+// 1. consumer id
+int constexpr kOutfeedHeaderWords = 2;
+uint32_t constexpr kOutfeedHeaderStart = 271828;
+// Special consumer IDs, without outfeed payload.
+uint32_t constexpr kOutfeedCidShutdown = 0;
+
+// A Device and its PjRtClient.
+struct DeviceWithClient {
+  Device* device;
+  std::shared_ptr client;
+};
+
+// Encapsulates data received from a device outfeed.
+class OutfeedData {
+ public:
+  OutfeedData(DeviceWithClient device_client, uint32_t consumer_id, Shape shape)
+      : device_client_(device_client),
+        consumer_id_(consumer_id),
+        shape_(shape),
+        literal_(nullptr),
+        literal_size_bytes_(0) {}
+
+  DeviceWithClient device_client() { return device_client_; }
+  uint32_t consumer_id() const { return consumer_id_; }
+  Shape shape() const { return shape_; }
+  std::unique_ptr literal() {
+    CHECK(literal_);
+    return std::move(literal_);
+  }
+
+  void SetLiteral(std::unique_ptr literal);
+
+  ssize_t literal_size_bytes() const { return literal_size_bytes_; }
+
+  std::string DebugString() const;
+
+ private:
+  DeviceWithClient device_client_;
+  uint32_t consumer_id_;
+  Shape shape_;
+  std::unique_ptr literal_;
+  ssize_t literal_size_bytes_;
+};
+
+void OutfeedData::SetLiteral(std::unique_ptr literal) {
+  literal_ = std::move(literal);
+  shape_ = literal_->shape();
+  int total_size_bytes = 0;
+  ShapeUtil::ForEachSubshape(
+      shape_, [&](const Shape& literal_subshape, const ShapeIndex& index) {
+        if (!literal_subshape.IsTuple()) {
+          total_size_bytes += ShapeUtil::ByteSizeOf(literal_subshape, 8);
+        }
+      });
+  literal_size_bytes_ = total_size_bytes;
+}
+
+std::string OutfeedData::DebugString() const {
+  return absl::StrFormat("dev=%s; cons=%d; shape=%s",
+                         device_client_.device->DebugString(), consumer_id_,
+                         shape_.ToString());
+}
+
+class OutfeedReceiverImpl {
+ public:
+  OutfeedReceiverImpl(OutfeedReceiver::Callback callback,
+                      std::vector> clients,
+                      ssize_t max_callback_queue_size_bytes);
+
+  OutfeedReceiverImpl(const OutfeedReceiverImpl&) = delete;
+  OutfeedReceiverImpl& operator=(const OutfeedReceiverImpl&) = delete;
+
+  // Blocks until all data has been received from devices and all data
+  // in the queue has been passed to Python.
+  ~OutfeedReceiverImpl();
+
+  void Start();
+
+  StatusOr AddOutfeedToBuilder(XlaBuilder* builder, XlaOp token,
+                               uint32_t consumer_id,
+                               std::vector arrays);
+
+ private:
+  bool CallbackQueueNotEmpty() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return !callback_queue_.empty();
+  }
+
+  bool CallbackQueueHasSpace() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return callback_queue_size_bytes_ < max_callback_queue_size_bytes_;
+  }
+
+  bool ShutdownDone() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return (num_working_callback_threads_ == 0 && num_listening_threads_ == 0);
+  }
+
+  void CallbackThreadLoop();
+  void DeviceListenerThreadLoop(int device_idx);
+
+  // Enqueues to a device an outfeed operation with a shutdown consumer ID.
+  Status SendShutdownOutfeedHeader(int device_idx);
+
+  // Receives a raw Literal from a device outfeed.
+  StatusOr> ReceiveRawFromOutfeed(const Device* device,
+                                  const Shape& shape);
+
+  // Enqueues received data in the callback queue.
+  void EnqueueReceivedData(std::unique_ptr received)
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Shuts down the threads. See implementation notes at top of file.
+  // It is not safe to restart an OutfeedReceiver after shutting down one.
+  void Shutdown();
+
+  OutfeedReceiver::Callback callback_;
+  // The devices on which we are listening, with their clients.
+ std::vector devices_; + // Maximum bytes capacity of the callback queue. + uint64_t max_callback_queue_size_bytes_; + + absl::Mutex mu_; + // Registered shapes by consumer id. + // The shape registry must be alive as long as the program exists. + // Right now we tell the user to never restart after Shutdown. + absl::flat_hash_map shape_registry_ TF_GUARDED_BY(mu_); + // How many bytes of Literal are in the callback queue. + uint64_t callback_queue_size_bytes_ TF_GUARDED_BY(mu_); + // Threads listening. + int num_listening_threads_ TF_GUARDED_BY(mu_); + bool shutdown_started_ TF_GUARDED_BY(mu_); + + // How many callback threads are still working. Used for shutdown. + int num_working_callback_threads_ TF_GUARDED_BY(mu_); + + std::queue> callback_queue_ TF_GUARDED_BY(mu_); + // The threadpool must come last to ensure the queue exists + // when the pool destructor is called. + std::unique_ptr threads_; +}; + +OutfeedReceiverImpl::OutfeedReceiverImpl( + OutfeedReceiver::Callback callback, + std::vector> clients, + ssize_t max_callback_queue_size_bytes) { + callback_ = callback; + max_callback_queue_size_bytes_ = max_callback_queue_size_bytes; + for (const auto& client : clients) { + for (const auto& device : client->devices()) { + devices_.push_back(DeviceWithClient{device.get(), client}); + } + } + CHECK_GT(devices_.size(), 0); + + callback_queue_size_bytes_ = 0; + num_listening_threads_ = 0; + num_working_callback_threads_ = 0; + shutdown_started_ = false; +} + +void OutfeedReceiverImpl::Start() { + { + absl::MutexLock lock(&mu_); + CHECK(!shutdown_started_); + } + int num_threads = 1 + devices_.size(); + threads_ = absl::make_unique( + tensorflow::Env::Default(), "outfeed_receiver", num_threads); + threads_->Schedule([this]() { CallbackThreadLoop(); }); + for (int device_idx = 0; device_idx < devices_.size(); ++device_idx) { + threads_->Schedule( + [this, device_idx]() { DeviceListenerThreadLoop(device_idx); }); + } +} + +void OutfeedReceiverImpl::Shutdown() { + VLOG(2) << "Shutdown start"; + { + absl::MutexLock lock(&mu_); + CHECK(!shutdown_started_); + shutdown_started_ = true; + } + for (int device_idx = 0; device_idx < devices_.size(); ++device_idx) { + CHECK(SendShutdownOutfeedHeader(device_idx).ok()); + } + VLOG(2) << "Shutdown waiting for listening and callback threads to stop"; + absl::MutexLock lock(&mu_); + mu_.Await(absl::Condition(this, &OutfeedReceiverImpl::ShutdownDone)); + VLOG(2) << "Shutdown done"; +} + +OutfeedReceiverImpl::~OutfeedReceiverImpl() { + VLOG(2) << "~OutfeedReceiverImpl"; + Shutdown(); +} + +void OutfeedReceiverImpl::DeviceListenerThreadLoop(int device_idx) { + { + absl::MutexLock lock(&mu_); + ++num_listening_threads_; + } + DeviceWithClient device_client = devices_[device_idx]; + while (true) { + Shape header_shape = ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}); + std::unique_ptr header = + ReceiveRawFromOutfeed(device_client.device, header_shape).ValueOrDie(); + absl::Span header_data = header->data(); + CHECK_EQ(header_data.size(), kOutfeedHeaderWords); + CHECK_EQ(header_data[0], kOutfeedHeaderStart); + uint32_t consumer_id = header_data[1]; + Shape shape; + { + absl::MutexLock lock(&mu_); + auto registered_shape = shape_registry_.find(consumer_id); + if (registered_shape == shape_registry_.end()) { + LOG(FATAL) + << "[" << device_client.device->DebugString() + << "] Cannot find registered shape for consumer ID " << consumer_id + << ". 
Perhaps the code was compiled with a different instance " + << "of OutfeedReceiver."; + } + shape = registered_shape->second; + } + auto received = + absl::make_unique(device_client, consumer_id, shape); + VLOG(2) << "Listener received header " << received->DebugString(); + if (consumer_id == kOutfeedCidShutdown) { + VLOG(2) << "[" << device_client.device->DebugString() + << "] Listener received shutdown header"; + absl::MutexLock lock(&mu_); + --num_listening_threads_; + if (num_listening_threads_ == 0) { + VLOG(2) << "Last listener shutdown; enqueue shutdown callback"; + EnqueueReceivedData(std::move(received)); + } + return; + } + std::unique_ptr data = + ReceiveRawFromOutfeed(device_client.device, shape).ValueOrDie(); + received->SetLiteral(std::move(data)); + absl::MutexLock lock(&mu_); + EnqueueReceivedData(std::move(received)); + } +} + +void OutfeedReceiverImpl::EnqueueReceivedData( + std::unique_ptr received) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + mu_.Await(absl::Condition(this, &OutfeedReceiverImpl::CallbackQueueHasSpace)); + ssize_t literal_size_bytes = received->literal_size_bytes(); + callback_queue_size_bytes_ += literal_size_bytes; + VLOG(2) << "Listener enqueues data " << received->DebugString() << " of size " + << literal_size_bytes << " bytes; " << (1 + callback_queue_.size()) + << " callbacks in queue of total size " << callback_queue_size_bytes_ + << " bytes.\n"; + callback_queue_.push(std::move(received)); +} + +StatusOr> OutfeedReceiverImpl::ReceiveRawFromOutfeed( + const Device* device, const Shape& shape) { + std::shared_ptr literal_shared; + + TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, + device->GetLocalDeviceState()); + TF_ASSIGN_OR_RETURN(Literal literal, + local_device->client()->TransferFromOutfeedLocal( + shape, local_device->device_ordinal())); + + return absl::make_unique(std::move(literal)); +} + +void OutfeedReceiverImpl::CallbackThreadLoop() { + { + absl::MutexLock lock(&mu_); + num_working_callback_threads_++; + CHECK_EQ(num_working_callback_threads_, 1); + } + while (true) { + std::unique_ptr received; + { + absl::MutexLock lock(&mu_); + mu_.Await( + absl::Condition(this, &OutfeedReceiverImpl::CallbackQueueNotEmpty)); + received = std::move(callback_queue_.front()); + callback_queue_.pop(); + callback_queue_size_bytes_ -= received->literal_size_bytes(); + VLOG(2) << "Dequeued callback for " << received->DebugString() << "; " + << callback_queue_.size() << " callbacks in queue of total size " + << callback_queue_size_bytes_ << " bytes.\n"; + } + if (received->consumer_id() == kOutfeedCidShutdown) { + VLOG(2) << "Callback loop received shutdown signal"; + { + absl::MutexLock lock(&mu_); + CHECK(callback_queue_.empty()); + CHECK_EQ(callback_queue_size_bytes_, 0); + --num_working_callback_threads_; + } + VLOG(2) << "Callback loop done"; + return; + } + { + tensorflow::profiler::TraceMe traceme("OutfeedReceiver::Callback"); + DeviceWithClient device_client = received->device_client(); + callback_(device_client.device, std::move(device_client.client), + received->consumer_id(), received->literal()); + } + } +} + +Status OutfeedReceiverImpl::SendShutdownOutfeedHeader(int device_idx) { + const Device* device = devices_[device_idx].device; + constexpr int consumer_id = kOutfeedCidShutdown; + VLOG(2) << "[" << device->DebugString() + << "] SendSpecialHeader cons=" << consumer_id; + XlaBuilder builder( + absl::StrFormat("special_outfeed_header_%d_%d", consumer_id, device_idx)); + XlaOp send = + AddOutfeedToBuilder(&builder, CreateToken(&builder), 
consumer_id, {}) + .ValueOrDie(); + XlaComputation computation = builder.Build(send).ValueOrDie(); + + CompileOptions compile_options; + compile_options.executable_build_options.set_num_replicas(1); + compile_options.executable_build_options.set_num_partitions(1); + DeviceAssignment device_assignment(1, 1); + device_assignment(0, 0) = device->id(); + compile_options.executable_build_options.set_device_assignment( + device_assignment); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PjRtExecutable::Compile(computation, devices_[device_idx].client.get(), + std::move(compile_options))); + ExecuteOptions execute_options; + TF_ASSIGN_OR_RETURN(std::vector> output_buffers, + executable->Execute({}, execute_options)); + return Status::OK(); +} + +StatusOr OutfeedReceiverImpl::AddOutfeedToBuilder( + XlaBuilder* builder, XlaOp token, uint32_t consumer_id, + std::vector arrays) { + XlaOp data = Tuple(builder, std::move(arrays)); + Shape shape_with_layout = builder->GetShape(data).ValueOrDie(); + ShapeUtil::ForEachMutableSubshape( + &shape_with_layout, [](Shape* subshape, const ShapeIndex&) { + if (!subshape->has_layout()) { + LayoutUtil::SetToDefaultLayout(subshape); + } + }); + VLOG(2) << "RegisterShape cons=" << consumer_id + << "; shape=" << shape_with_layout.ToString(); + { + absl::MutexLock lock(&mu_); + auto found = shape_registry_.find(consumer_id); + if (found != shape_registry_.end()) { + if (!ShapeUtil::Equal(shape_with_layout, found->second)) { + return InvalidArgument( + "Shape %s does not match previous shape %s used " + "for consumer id %d", + shape_with_layout.DebugString(), found->second.DebugString(), + consumer_id); + } + } else { + shape_registry_.insert({consumer_id, shape_with_layout}); + } + } + + std::vector header{kOutfeedHeaderStart, consumer_id}; + XlaOp header_op = ConstantR1(builder, header); + token = OutfeedWithToken( + header_op, token, ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}), ""); + if (consumer_id != kOutfeedCidShutdown) { + token = OutfeedWithToken(data, token, shape_with_layout, ""); + } + return token; +} + +OutfeedReceiver::OutfeedReceiver( + Callback callback, std::vector> clients, + ssize_t max_callback_queue_size_bytes) { + p_impl_ = absl::make_unique( + callback, std::move(clients), max_callback_queue_size_bytes); +} + +OutfeedReceiver::~OutfeedReceiver() {} + +void OutfeedReceiver::Start() { p_impl_->Start(); } + +StatusOr OutfeedReceiver::AddOutfeedToBuilder( + XlaBuilder* builder, XlaOp token, uint32_t consumer_id, + std::vector arrays) { + if (consumer_id == kOutfeedCidShutdown) { + return InvalidArgument("Consumer ID cannot be a reserved value: %d", + consumer_id); + } + return p_impl_->AddOutfeedToBuilder(builder, token, consumer_id, arrays); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.h b/tensorflow/compiler/xla/python/outfeed_receiver.h new file mode 100644 index 00000000000..a0fdfcd36f0 --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver.h @@ -0,0 +1,77 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_
+
+#include
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+class OutfeedReceiverImpl;
+
+// Implements a multithreaded receiver of outfeeds from devices.
+class OutfeedReceiver {
+ public:
+  // A callback takes: the device, the client (for that device), the consumer
+  // id, and the received literal.
+  // The client pointer should be alive while the device is used.
+  using Callback = std::function,
+                                 uint32_t, std::shared_ptr)>;
+
+  // Constructs the receiver for the given clients and callback function.
+  //
+  // Args:
+  //   callback: a function to be called when an outfeed is ready for
+  //     processing.
+  //   clients: the clients whose devices to listen on.
+  //   max_callback_queue_size_bytes: the maximum number of bytes for all
+  //     received outfeeds queued to be processed. When this limit is reached
+  //     we pause receiving outfeeds from devices.
+  OutfeedReceiver(Callback callback,
+                  std::vector> clients,
+                  ssize_t max_callback_queue_size_bytes);
+
+  OutfeedReceiver(const OutfeedReceiver&) = delete;
+  OutfeedReceiver& operator=(const OutfeedReceiver&) = delete;
+
+  // Blocks until all data has been received from devices and all data
+  // in the queue has been passed to Python.
+  ~OutfeedReceiver();
+
+  // Starts the listener threads and the callback thread.
+  void Start();
+
+  // Adds to the computation builder the outfeed of the arrays.
+  // Has the side-effect of registering the sent shape for the consumer_id.
+  // Returns an error status if the outfeed shape is different from the
+  // previously used shape for the same consumer_id, or if the consumer id is
+  // invalid.
+  StatusOr AddOutfeedToBuilder(XlaBuilder* builder, XlaOp token,
+                               uint32_t consumer_id,
+                               std::vector arrays);
+
+ private:
+  std::unique_ptr p_impl_;
+};
+
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_
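For reference, a minimal usage sketch of this interface follows. It is a condensed, illustrative version of the ReceiveOutfeedSimple unit test added later in this patch (outfeed_receiver_test.cc) and assumes the same helpers used there (GetCpuClient, Iota, CreateToken, and a compile-and-execute step); it is not itself part of the patch, and the consumer ID and queue size are arbitrary.

// Sketch: wire one outfeed into a computation and receive it via the callback.
auto callback = [](Device* device, std::shared_ptr<PjRtClient> client,
                   uint32_t consumer_id, std::shared_ptr<Literal> literal) {
  // Runs on the single callback thread; consume `literal` for `consumer_id`.
};
std::shared_ptr<PjRtClient> cpu_client = GetCpuClient(true).ValueOrDie();
auto receiver = std::make_shared<OutfeedReceiver>(
    callback, std::vector<std::shared_ptr<PjRtClient>>{cpu_client},
    /*max_callback_queue_size_bytes=*/128 * 1024 * 1024);
receiver->Start();

XlaBuilder builder("outfeed_example");
XlaOp data = Iota(&builder, ShapeUtil::MakeShape(U32, {16}), 0);
XlaOp token = receiver
                  ->AddOutfeedToBuilder(&builder, CreateToken(&builder),
                                        /*consumer_id=*/5, {data})
                  .ValueOrDie();
// Compile and execute the computation rooted at `token` (the test's
// CompileAndExecute helper shows one way); the callback fires once the device
// outfeeds the data. Dropping the last reference to `receiver` shuts the
// listener threads down and blocks until all queued data has been delivered.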
diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc new file mode 100644 index 00000000000..a6256cfe86c --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc @@ -0,0 +1,156 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/outfeed_receiver_py.h"
+
+#include
+
+#include "absl/memory/memory.h"
+#include "absl/synchronization/mutex.h"
+#include "pybind11/functional.h"
+#include "pybind11/pybind11.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/python/outfeed_receiver.h"
+#include "tensorflow/compiler/xla/python/types.h"
+
+namespace xla {
+
+namespace py = pybind11;
+
+namespace {
+
+// A wrapper for OutfeedReceiver for use from Python, useful for ensuring
+// that the GIL is released before destroying the OutfeedReceiver.
+class OutfeedReceiverForPython {
+ public:
+  // A callback to Python takes: the device (wrapped with its client), the
+  // consumer id, and the received literal.
+  using CallbackToPython =
+      std::function, uint32_t, pybind11::object)>;
+
+  OutfeedReceiverForPython(CallbackToPython callback_python,
+                           std::vector> clients,
+                           ssize_t max_callback_queue_size_bytes) {
+    callback_python_ = callback_python;
+    outfeed_receiver_shutting_down_ = false;
+    OutfeedReceiver::Callback callback =
+        [this](Device* device, std::shared_ptr client,
+               uint32_t consumer_id, std::shared_ptr literal) {
+          this->Callback(device, client, consumer_id, literal);
+        };
+    outfeed_receiver_ = absl::make_unique(
+        callback, std::move(clients), max_callback_queue_size_bytes);
+  }
+  OutfeedReceiverForPython(const OutfeedReceiverForPython&) = delete;
+  OutfeedReceiverForPython& operator=(const OutfeedReceiverForPython&) = delete;
+
+  ~OutfeedReceiverForPython() {
+    // This destructor is called from the Python GC. Release the GIL for the
+    // duration of the destruction, including the destruction of the
+    // OutfeedReceiver, when we may actually have to wait for threads to end.
+    // During this time we do not call back into Python (sometimes we get an
+    // exception "std::runtime_error: scoped_acquire::dec_ref(): thread state
+    // must be current!").
+    {
+      absl::MutexLock lock(&mu_);
+      outfeed_receiver_shutting_down_ = true;
+    }
+    py::gil_scoped_release gil_release;
+    outfeed_receiver_ = nullptr; // Shut down the outfeed receiver.
+  }
+
+  void Start() { outfeed_receiver_->Start(); }
+
+  StatusOr AddOutfeed(XlaBuilder* builder, XlaOp token,
+                      uint32_t consumer_id, std::vector arrays) {
+    return outfeed_receiver_->AddOutfeedToBuilder(builder, token, consumer_id,
+                                                  arrays);
+  }
+
+  void Callback(Device* device, std::shared_ptr client,
+                uint32_t consumer_id, std::shared_ptr literal) {
+    {
+      absl::MutexLock lock(&mu_);
+      if (outfeed_receiver_shutting_down_) {
+        VLOG(2) << "Ignoring unsafe callback to Python during shutdown";
+        return;
+      }
+    }
+    py::gil_scoped_acquire gil_acquire; // Need GIL also for LiteralToPython
+    py::object literal_python =
+        LiteralToPython(std::move(literal)).ValueOrDie();
+    // callback_python_ should handle all exceptions in user code. If we get
+    // an exception here, it is a bug in the callback and we should stop.
+    callback_python_(WrapWithClient(std::move(client), device),
+                     consumer_id, std::move(literal_python));
+  }
+
+ private:
+  CallbackToPython callback_python_;
+  absl::Mutex mu_;
+  bool outfeed_receiver_shutting_down_ TF_GUARDED_BY(mu_);
+  std::unique_ptr outfeed_receiver_;
+};
+
+} // namespace
+
+void BuildOutfeedReceiverSubmodule(py::module* m) {
+  py::module outfeed_receiver =
+      m->def_submodule("outfeed_receiver", "Outfeed receiver");
+  outfeed_receiver.def(
+      "start",
+      [](OutfeedReceiverForPython::CallbackToPython callback_to_python,
+         std::vector> clients,
+         ssize_t max_callback_queue_size_bytes)
+          -> std::unique_ptr {
+        auto server = absl::make_unique(
+            callback_to_python, clients, max_callback_queue_size_bytes);
+        server->Start();
+        return server;
+      },
+      py::arg("callback_to_python"), py::arg("backends"),
+      py::arg("max_queue_size_bytes") = 256 * 1024 * 1024,
+      R"(Starts a multithreaded outfeed receiver.
+
+      There is one thread for each of the specified devices. When Python
+      drops the last reference to the returned object, the receiver is shut
+      down. The destructor will block until all data is received from
+      devices.
+
+      Args:
+      * callback_to_python: a Python callback to call, with the consumer ID
+        and the data received.
+      * backends: the list of backends to listen on.
+      * max_queue_size_bytes: an optional integer to bound the total size of
+        the arrays in the callback queue. When this limit is reached the
+        device listener pauses.
+      )",
+      py::call_guard());
+
+  py::class_ outfeed_receiver_class(
+      outfeed_receiver, "OutfeedReceiverForPython");
+
+  outfeed_receiver_class.def(
+      "add_outfeed", &OutfeedReceiverForPython::AddOutfeed, py::arg("builder"),
+      py::arg("token"), py::arg("consumer_id"), py::arg("arrays"),
+      R"(Adds an outfeed into the given computation builder.
+
+      Has the side-effect of registering the sent shape along with the consumer
+      ID. Returns an error if the outfeed shape is not compatible with the
+      previously used shape for the same consumer ID.)",
+      py::call_guard());
+}
+
+} // namespace xla
diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.h b/tensorflow/compiler/xla/python/outfeed_receiver_py.h new file mode 100644 index 00000000000..6b1a712327a --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.h @@ -0,0 +1,27 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ + +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildOutfeedReceiverSubmodule(pybind11::module* m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc new file mode 100644 index 00000000000..ea84b4e18d6 --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc @@ -0,0 +1,258 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/outfeed_receiver.h" + +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/pjrt/cpu_device.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/test.h" + +namespace xla { + +namespace { + +Status CompileAndExecute(XlaBuilder* builder, XlaOp root, int device_id, + PjRtClient* client) { + XlaComputation computation = builder->Build(root).ValueOrDie(); + + CompileOptions compile_options; + compile_options.executable_build_options.set_num_replicas(1); + compile_options.executable_build_options.set_num_partitions(1); + DeviceAssignment device_assignment(1, 1); + device_assignment(0, 0) = device_id; + compile_options.executable_build_options.set_device_assignment( + device_assignment); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PjRtExecutable::Compile(computation, client, std::move(compile_options))); + ExecuteOptions execute_options; + TF_ASSIGN_OR_RETURN(std::vector> output_buffers, + executable->Execute({}, execute_options)); + return Status::OK(); +} + +// Accumulates the received data. 
+class Accumulator { + public: + struct Data { + uint32_t consumer_id; + std::shared_ptr data; + }; + + void Receive(uint32_t consumer_id, std::shared_ptr data) { + absl::MutexLock lock(&mutex_); + received_.push_back(Data{consumer_id, data}); + } + + std::vector received() { + absl::MutexLock lock(&mutex_); + return received_; + } + + private: + absl::Mutex mutex_; + std::vector received_ TF_GUARDED_BY(mutex_); +}; + +TEST(OutfeedReceiverTest, ReceiveOutfeedSimple) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data = Iota(&builder, shape0, 0); + XlaOp send = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder, send, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. + outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(1, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); +} + +TEST(OutfeedReceiverTest, ReceiveOutfeedTwoComputations) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder0("execute_test_outfeed_0"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder0, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder0, CreateToken(&builder0), + consumer_id0, {data0}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder0, send0, 0, cpu_client.get()).ok()); + + XlaBuilder builder1("execute_test_outfeed_1"); + constexpr int consumer_id1 = 6; + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder1, shape1, 0); + XlaOp send1 = outfeed_receiver + ->AddOutfeedToBuilder(&builder1, CreateToken(&builder1), + consumer_id1, {data1}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder1, send1, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. 
+ outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(2, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); + EXPECT_EQ(consumer_id1, received[1].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape1}), received[1].data->shape()); +} + +TEST(OutfeedReceiverTest, ReceiveOutfeedTwoOutfeed) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data0}) + .ValueOrDie(); + + constexpr int consumer_id1 = 6; + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder, shape1, 0); + XlaOp send1 = + outfeed_receiver + ->AddOutfeedToBuilder(&builder, send0, consumer_id1, {data1}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder, send1, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. + outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(2, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); + EXPECT_EQ(consumer_id1, received[1].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape1}), received[1].data->shape()); +} + +TEST(OutfeedReceiverTest, DifferentShapeForConsumerIdError) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data0}) + .ValueOrDie(); + + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder, shape1, 0); + // A different shape for the same consumer ID. 
+ StatusOr send1 = outfeed_receiver->AddOutfeedToBuilder( + &builder, send0, consumer_id0, {data1}); + EXPECT_FALSE(send1.ok()); + EXPECT_THAT(send1.status().ToString(), + testing::HasSubstr("does not match previous shape element_type")); +} + +TEST(OutfeedReceiverTest, InvalidConsumerIdError) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + StatusOr send0 = outfeed_receiver->AddOutfeedToBuilder( + &builder, CreateToken(&builder), 0, {data0}); + + EXPECT_FALSE(send0.ok()); + EXPECT_THAT(send0.status().ToString(), + testing::HasSubstr("Consumer ID cannot be a reserved value")); +} + +} // namespace + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py index ef0caff0ae6..6d4482af43f 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py @@ -20,6 +20,9 @@ from __future__ import print_function from absl import logging +# Import xla_client to load shared C++ extensions (just CompileOptions at the +# time of writing). +from tensorflow.compiler.xla.python import xla_client # pylint: disable=unused-import from tensorflow.compiler.xla.python.tpu_driver.client import tpu_client_extension as _tpu_client diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f03595bf677..0b6824e83e9 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -24,17 +24,12 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/types/optional.h" #include "absl/types/span.h" +#include "pybind11/attr.h" #include "pybind11/cast.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/pytypes.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/lib/comparators.h" -#include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/qr.h" -#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" -#include "tensorflow/compiler/xla/client/lib/sorting.h" -#include "tensorflow/compiler/xla/client/lib/svd.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -47,6 +42,8 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" +#include "tensorflow/compiler/xla/python/ops.h" +#include "tensorflow/compiler/xla/python/outfeed_receiver_py.h" #include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -62,15 +59,16 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/profiler/rpc/profiler_server.h" +#include "tensorflow/python/profiler/internal/traceme_wrapper.h" #include "tensorflow/stream_executor/platform.h" namespace xla { +namespace { namespace py = pybind11; -namespace { +using ::tensorflow::profiler::TraceMeWrapper; struct Uniquer { absl::Mutex mu; @@ -304,358 +302,6 @@ StatusOr PjRtBufferCudaArrayInterface(const PjRtBuffer& buffer) { return result; } -void BuildOpsSubmodule(py::module* m) { - // ops submodule, containing free functions that add operators to an - // XlaBuilder. - py::module ops = m->def_submodule("ops", "XLA operations"); - - py::enum_( - ops, "TriangularSolveOptions_Transpose") - .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) - .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) - .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) - .value("ADJOINT", TriangularSolveOptions::ADJOINT); - - ops.def("AfterAll", &AfterAll, py::arg("builder"), py::arg("tokens")); - ops.def( - "AllReduce", - static_cast, - const absl::optional&, const absl::optional&)>( - &AllReduce), - py::arg("operand"), py::arg("computation"), - py::arg("replica_groups") = py::list(), - py::arg("channel_id") = absl::nullopt, - py::arg("shape_with_layout") = absl::nullopt); - ops.def("AllToAll", &AllToAll, py::arg("operand"), py::arg("split_dimension"), - py::arg("concat_dimension"), py::arg("split_count"), - py::arg("replica_groups") = py::list(), - py::arg("layout") = absl::nullopt); - ops.def("CollectivePermute", &CollectivePermute, py::arg("operand"), - py::arg("source_target_pairs")); - ops.def("CreateToken", &CreateToken, py::arg("builder")); - ops.def("CrossReplicaSum", - static_cast)>( - &CrossReplicaSum), - py::arg("operand"), py::arg("replica_groups") = py::list()); - ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"), - py::arg("new_element_type")); - ops.def("Broadcast", &Broadcast, py::arg("operand"), py::arg("sizes")); - ops.def("BroadcastInDim", &BroadcastInDim, py::arg("operand"), - py::arg("shape"), py::arg("broadcast_dimensions")); - ops.def("Call", &Call, py::arg("builder"), py::arg("computation"), - py::arg("operands")); - ops.def("Cholesky", &Cholesky, py::arg("a"), py::arg("lower") = true); - ops.def("Clamp", &Clamp, py::arg("min"), py::arg("operand"), py::arg("max")); - ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions")); - ops.def("ConcatInDim", &ConcatInDim, py::arg("builder"), py::arg("operands"), - py::arg("dimension")); - ops.def("Conditional", - static_cast, - absl::Span)>(&Conditional), - py::arg("branch_index"), py::arg("branch_computations"), - py::arg("branch_operands")); - ops.def("Conditional", - static_cast(&Conditional), - py::arg("predicate"), py::arg("true_operand"), - py::arg("true_computation"), py::arg("false_operand"), - py::arg("false_computation")); - ops.def("Constant", &ConstantLiteral, py::arg("builder"), py::arg("literal")); - ops.def("ConstantLiteral", &ConstantLiteral, py::arg("builder"), - py::arg("literal")); - ops.def("ConvGeneralDilated", &ConvGeneralDilated, py::arg("lhs"), - py::arg("rhs"), py::arg("window_strides"), py::arg("padding"), - py::arg("lhs_dilation"), py::arg("rhs_dilation"), - py::arg("dimension_numbers"), py::arg("feature_group_count") = 1, - py::arg("batch_group_count") = 1, - py::arg("precision_config") = 
nullptr); - ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), - py::arg("new_element_type")); - ops.def( - "CustomCall", - [](XlaBuilder* builder, const py::bytes& call_target_name, - absl::Span operands, const Shape& shape, - const py::bytes& opaque) -> XlaOp { - return CustomCall(builder, call_target_name, operands, shape, opaque); - }, - py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), - py::arg("shape"), py::arg("opaque") = py::bytes("")); - ops.def( - "CustomCallWithLayout", - [](XlaBuilder* builder, const py::bytes& call_target_name, - absl::Span operands, const Shape& shape_with_layout, - absl::Span operand_shapes_with_layout, - const py::bytes& opaque) -> XlaOp { - return CustomCallWithLayout(builder, call_target_name, operands, - shape_with_layout, - operand_shapes_with_layout, opaque); - }, - py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), - py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), - py::arg("opaque") = py::bytes("")); - ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), - py::arg("precision_config") = nullptr); - ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), - py::arg("dimension_numbers"), py::arg("precision_config") = nullptr); - ops.def("DynamicSlice", - static_cast, - absl::Span)>(&DynamicSlice), - py::arg("operand"), py::arg("start_indices"), py::arg("slice_sizes")); - ops.def("DynamicUpdateSlice", - static_cast)>( - &DynamicUpdateSlice), - py::arg("operand"), py::arg("update"), py::arg("start_indices")); - - ops.def("Fft", &Fft, py::arg("operand"), py::arg("fft_type"), - py::arg("fft_length")); - - ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), - py::arg("dimension_numbers"), py::arg("slice_sizes"), - py::arg("indices_are_sorted") = false); - ops.def("GetTupleElement", &GetTupleElement, py::arg("tuple_data"), - py::arg("index")); - ops.def("InfeedWithToken", &InfeedWithToken, py::arg("token"), - py::arg("shape"), py::arg("config") = ""); - ops.def("Iota", - static_cast(&Iota), - py::arg("builder"), py::arg("shape"), py::arg("iota_dimension")); - ops.def("Iota", - static_cast(&Iota), - py::arg("builder"), py::arg("type"), py::arg("size")); - ops.def("Map", &Map, py::arg("builder"), py::arg("operands"), - py::arg("computation"), py::arg("dimensions"), - py::arg("static_operands") = py::list()); - ops.def("NextAfter", &NextAfter, py::arg("from"), py::arg("to")); - ops.def("OutfeedWithToken", &OutfeedWithToken, py::arg("operand"), - py::arg("token"), py::arg("shape_with_layout"), - py::arg("outfeed_config") = ""); - ops.def("Pad", &Pad, py::arg("operand"), py::arg("padding_value"), - py::arg("padding_config")); - ops.def("Parameter", - static_cast&)>( - &Parameter), - py::arg("builder"), py::arg("parameter_number"), py::arg("shape"), - py::arg("name") = "", - py::arg("replicated_at_leaf_buffers") = std::vector()); - ops.def( - "QR", - [](XlaOp a, bool full_matrices) -> StatusOr> { - TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); - return std::make_pair(qr.q, qr.r); - }, - py::arg("operand"), py::arg("full_matrices")); - ops.def( - "Eigh", - [](XlaOp a, bool lower, int64 max_iter, - float epsilon) -> std::pair { - auto eigh = SelfAdjointEig(a, lower, max_iter, epsilon); - return std::make_pair(eigh.v, eigh.w); - }, - py::arg("a"), py::arg("lower") = true, py::arg("max_iter") = 100, - py::arg("epsilon") = 1e-6); - ops.def( - "SVD", - [](XlaOp a, int64 max_iter, - float epsilon) -> std::tuple { - auto svd = SVD(a, max_iter, 
epsilon); - return std::make_tuple(svd.u, svd.d, svd.v); - }, - py::arg("a"), py::arg("max_iter") = 100, py::arg("epsilon") = 1e-6); - ops.def("Reduce", - static_cast, - absl::Span, const XlaComputation&, - absl::Span)>(&Reduce), - py::arg("builder"), py::arg("operands"), py::arg("init_values"), - py::arg("computation"), py::arg("dimensions_to_reduce")); - ops.def("ReducePrecision", &ReducePrecision, py::arg("operand"), - py::arg("exponent_bits"), py::arg("mantissa_bits")); - ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding, - py::arg("operand"), py::arg("init_value"), py::arg("computation"), - py::arg("window_dimensions"), py::arg("window_strides"), - py::arg("base_dilations"), py::arg("window_dilations"), - py::arg("padding")); - ops.def("ReplicaId", &ReplicaId, py::arg("builder")); - ops.def("Reshape", - static_cast, - absl::Span)>(&Reshape), - py::arg("operand"), py::arg("dimensions"), py::arg("new_sizes")); - ops.def("Reshape", - static_cast)>(&Reshape), - py::arg("operand"), py::arg("new_sizes")); - ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions")); - ops.def("RngNormal", &RngNormal, py::arg("mu"), py::arg("sigma"), - py::arg("shape")); - ops.def("RngUniform", &RngUniform, py::arg("a"), py::arg("b"), - py::arg("shape")); - ops.def("Scatter", &Scatter, py::arg("input"), py::arg("scatter_indices"), - py::arg("updates"), py::arg("update_computation"), - py::arg("dimension_numbers"), py::arg("indices_are_sorted") = false, - py::arg("unique_indices") = false); - ops.def("Select", &Select, py::arg("pred"), py::arg("on_true"), - py::arg("on_false")); - ops.def("SelectAndScatterWithGeneralPadding", - &SelectAndScatterWithGeneralPadding, py::arg("operand"), - py::arg("select"), py::arg("window_dimensions"), - py::arg("window_strides"), py::arg("padding"), py::arg("source"), - py::arg("init_value"), py::arg("scatter")); - ops.def("Slice", &Slice, py::arg("operand"), py::arg("start_indices"), - py::arg("limit_indices"), py::arg("strides")); - ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"), - py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); - ops.def( - "Sort", - [](XlaBuilder* builder, absl::Span operands, - absl::optional comparator, int64 dimension, - bool is_stable) -> XlaOp { - return builder->ReportErrorOrReturn([&]() -> StatusOr { - std::vector operand_types; - for (const auto& operand : operands) { - TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand)); - operand_types.push_back(operand_shape.element_type()); - } - - if (comparator) { - return Sort(operands, **comparator, dimension, is_stable); - } else { - return Sort(operands, - CreateScalarLtComputation(operand_types, builder), - dimension, is_stable); - } - }); - }, - py::arg("builder"), py::arg("operands"), - py::arg("comparator") = absl::nullopt, py::arg("dimension") = -1, - py::arg("is_stable") = false); - ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); - ops.def("Transpose", &Transpose, py::arg("operand"), py::arg("permutation")); - ops.def("TriangularSolve", &TriangularSolve, py::arg("a"), py::arg("b"), - py::arg("left_side"), py::arg("lower"), py::arg("unit_diagonal"), - py::arg("transpose_a")); - ops.def("Tuple", &Tuple, py::arg("builder"), py::arg("elements")); - ops.def("While", &While, py::arg("condition"), py::arg("body"), - py::arg("init")); - - ops.def("Igamma", &Igamma, py::arg("a"), py::arg("x")); - ops.def("Igammac", &Igammac, py::arg("a"), py::arg("x")); - ops.def("IgammaGradA", &IgammaGradA, py::arg("a"), 
py::arg("x")); - ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); - ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), - py::arg("b"), py::arg("x")); - -#define BINARY_OP(op) \ - ops.def( \ - #op, \ - [](XlaOp a, XlaOp b, absl::optional> dims) { \ - return dims ? op(a, b, *dims) : op(a, b); \ - }, \ - py::arg("lhs"), py::arg("rhs"), \ - py::arg("broadcast_dimensions") = absl::nullopt) - BINARY_OP(Eq); - BINARY_OP(Ne); - BINARY_OP(Ge); - BINARY_OP(Gt); - BINARY_OP(Lt); - BINARY_OP(Le); - BINARY_OP(Add); - BINARY_OP(Sub); - BINARY_OP(Mul); - BINARY_OP(Div); - BINARY_OP(Rem); - BINARY_OP(Max); - BINARY_OP(Min); - BINARY_OP(And); - BINARY_OP(Or); - BINARY_OP(Xor); - BINARY_OP(ShiftLeft); - BINARY_OP(ShiftRightArithmetic); - BINARY_OP(ShiftRightLogical); - BINARY_OP(Atan2); - BINARY_OP(Pow); - BINARY_OP(Complex); -#undef BINARY_OP - -#define UNARY_OP(op) ops.def(#op, &op) - UNARY_OP(Not); - UNARY_OP(PopulationCount); - UNARY_OP(Clz); - UNARY_OP(Abs); - UNARY_OP(Exp); - UNARY_OP(Expm1); - UNARY_OP(Floor); - UNARY_OP(Ceil); - UNARY_OP(Round); - UNARY_OP(Log); - UNARY_OP(Log1p); - UNARY_OP(Sign); - UNARY_OP(Cos); - UNARY_OP(Sin); - UNARY_OP(Tanh); - UNARY_OP(IsFinite); - UNARY_OP(Neg); - UNARY_OP(Sqrt); - UNARY_OP(Rsqrt); - UNARY_OP(Square); - UNARY_OP(Reciprocal); - UNARY_OP(Erfc); - UNARY_OP(Erf); - UNARY_OP(ErfInv); - UNARY_OP(Lgamma); - UNARY_OP(Digamma); - UNARY_OP(BesselI0e); - UNARY_OP(BesselI1e); - UNARY_OP(Acos); - UNARY_OP(Asin); - UNARY_OP(Atan); - UNARY_OP(Tan); - UNARY_OP(Acosh); - UNARY_OP(Asinh); - UNARY_OP(Atanh); - UNARY_OP(Cosh); - UNARY_OP(Sinh); - UNARY_OP(Real); - UNARY_OP(Imag); - UNARY_OP(Conj); -#undef UNARY_OP -} - -// Helper to implement TraceMe as a context manager in Python. -class TraceMeContextManager { - public: - explicit TraceMeContextManager(py::str name, py::kwargs kwargs) - : name_(std::move(name)), kwargs_(std::move(kwargs)) {} - - void Enter() { - if (IsEnabled()) { - std::string name(name_); - if (!kwargs_.empty()) { - absl::StrAppend(&name, "#"); - bool first = true; - for (const auto entry : kwargs_) { - absl::StrAppend(&name, first ? 
"" : ",", - std::string(py::str(entry.first)), "=", - std::string(py::str(entry.second))); - first = false; - } - absl::StrAppend(&name, "#"); - } - traceme_.emplace(std::move(name)); - } - } - py::object Exit(const py::object& ex_type, const py::object& ex_value, - const py::object& traceback) { - traceme_.reset(); - return py::none(); - } - - static bool IsEnabled() { return tensorflow::profiler::TraceMe::Active(); } - - private: - py::str name_; - py::kwargs kwargs_; - absl::optional traceme_; -}; void BuildProfilerSubmodule(py::module* m) { py::module profiler = @@ -672,11 +318,19 @@ void BuildProfilerSubmodule(py::module* m) { }, py::arg("port")); - py::class_ traceme_class(profiler, "TraceMe"); + py::class_ traceme_class(profiler, "TraceMe", + py::module_local()); traceme_class.def(py::init()) - .def("__enter__", &TraceMeContextManager::Enter) - .def("__exit__", &TraceMeContextManager::Exit) - .def_static("is_enabled", &TraceMeContextManager::IsEnabled); + .def("__enter__", [](py::object self) -> py::object { return self; }) + .def("__exit__", + [](py::object self, const py::object& ex_type, + const py::object& ex_value, + const py::object& traceback) -> py::object { + py::cast(self)->Stop(); + return py::none(); + }) + .def("set_metadata", &TraceMeWrapper::SetMetadata) + .def_static("is_enabled", &TraceMeWrapper::IsEnabled); } } // namespace @@ -872,11 +526,7 @@ PYBIND11_MODULE(xla_extension, m) { DebugOptions* debug_options = options.executable_build_options.mutable_debug_options(); // Sets fast-math-disabling default options expected by JAX. - // TODO(phawkins): make these XLA-wide defaults. - debug_options->set_xla_cpu_fast_math_honor_infs(true); - debug_options->set_xla_cpu_fast_math_honor_nans(true); - debug_options->set_xla_cpu_fast_math_honor_division(true); - debug_options->set_xla_cpu_fast_math_honor_functions(true); + debug_options->set_xla_cpu_enable_fast_min_max(false); debug_options->set_xla_gpu_enable_fast_min_max(false); return options; })) @@ -934,34 +584,6 @@ PYBIND11_MODULE(xla_extension, m) { "client", [](const ClientAndPtr& device) { return device.client; }) .def("__str__", &Device::DebugString) - // TODO(phawkins): remove capitalized names after updating callers. - .def("TransferToInfeed", - [](const Device& device, const LiteralSlice& literal) { - GlobalPyRefManager()->CollectGarbage(); - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - return local_device->client()->TransferToInfeedLocal( - literal, local_device->device_ordinal()); - }) - .def( - "TransferFromOutfeed", - [](const Device& device, const Shape& shape) -> StatusOr { - GlobalPyRefManager()->CollectGarbage(); - std::shared_ptr literal_shared; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - TF_ASSIGN_OR_RETURN( - Literal literal, - local_device->client()->TransferFromOutfeedLocal( - shape, local_device->device_ordinal())); - - literal_shared = std::make_shared(std::move(literal)); - } - return LiteralToPython(std::move(literal_shared)); - }) .def("transfer_to_infeed", [](const Device& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); @@ -1248,28 +870,6 @@ PYBIND11_MODULE(xla_extension, m) { .def("size_of_generated_code_in_bytes", &PjRtExecutable::SizeOfGeneratedCodeInBytes) .def("delete", &PjRtExecutable::Delete) - // TODO(phawkins): delete capitalized methods after updating callers. 
- .def("Delete", &PjRtExecutable::Delete) - .def( - "Execute", - [](const PjRtExecutable& executable, - absl::Span args) - -> StatusOr>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector> output_buffers, - executable.Execute(args, options)); - std::vector> outputs; - outputs.reserve(output_buffers.size()); - for (auto& buffer : output_buffers) { - outputs.push_back(WrapWithClient( - executable.client()->shared_from_this(), std::move(buffer))); - } - return outputs; - }, - py::arg("arguments")) .def( "execute", [](const PjRtExecutable& executable, @@ -1290,33 +890,6 @@ PYBIND11_MODULE(xla_extension, m) { return outputs; }, py::arg("arguments")) - // TODO(phawkins): delete capitalized methods after updating callers. - .def( - "ExecuteOnLocalDevices", - [](const PjRtExecutable& executable, - absl::Span> args) - -> StatusOr< - std::vector>>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector>> - output_buffers, - executable.ExecuteOnLocalDevices(args, options)); - std::vector>> outputs; - outputs.resize(output_buffers.size()); - for (int computation = 0; computation < output_buffers.size(); - ++computation) { - for (auto& buffer : output_buffers[computation]) { - outputs[computation].push_back( - WrapWithClient(executable.client()->shared_from_this(), - std::move(buffer))); - } - } - return outputs; - }, - py::arg("arguments")) .def( "execute_on_local_devices", [](const PjRtExecutable& executable, @@ -1377,7 +950,19 @@ PYBIND11_MODULE(xla_extension, m) { &DebugOptions::set_xla_cpu_fast_math_honor_functions) .def_property("xla_gpu_enable_fast_min_max", &DebugOptions::xla_gpu_enable_fast_min_max, - &DebugOptions::set_xla_gpu_enable_fast_min_max); + &DebugOptions::set_xla_gpu_enable_fast_min_max) + .def_property("xla_backend_optimization_level", + &DebugOptions::xla_backend_optimization_level, + &DebugOptions::set_xla_backend_optimization_level) + .def_property("xla_cpu_enable_xprof_traceme", + &DebugOptions::xla_cpu_enable_xprof_traceme, + &DebugOptions::set_xla_cpu_enable_xprof_traceme) + .def_property("xla_llvm_disable_expensive_passes", + &DebugOptions::xla_llvm_disable_expensive_passes, + &DebugOptions::set_xla_llvm_disable_expensive_passes) + .def_property("xla_test_all_input_layouts", + &DebugOptions::xla_test_all_input_layouts, + &DebugOptions::set_xla_test_all_input_layouts); py::class_(m, "ExecutableBuildOptions") .def(py::init<>()) @@ -1406,7 +991,10 @@ PYBIND11_MODULE(xla_extension, m) { options.device_assignment()) : absl::nullopt; }, - &ExecutableBuildOptions::set_device_assignment); + &ExecutableBuildOptions::set_device_assignment) + .def_property("use_spmd_partitioning", + &ExecutableBuildOptions::use_spmd_partitioning, + &ExecutableBuildOptions::set_use_spmd_partitioning); py::class_(m, "XlaComputation") .def(py::init([](const py::bytes& serialized_hlo_module_proto) @@ -1415,12 +1003,6 @@ PYBIND11_MODULE(xla_extension, m) { proto.ParseFromString(serialized_hlo_module_proto); return absl::make_unique(proto); })) - // TODO(phawkins): delete capitalized names after updating callers. 
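Earlier in this file's diff, the profiler hunk replaces the TraceMeContextManager helper with a direct binding of TraceMeWrapper: __enter__ now simply returns the object, __exit__ stops the trace, and set_metadata can attach extra key/value pairs while the scope is open. The pure-Python sketch below models that context-manager protocol and the "name#key=value,...#" encoding the old Enter() helper built; ToyTraceMe is a hypothetical name and nothing here calls the real xla_extension profiler bindings.

class ToyTraceMe(object):
  """Hypothetical pure-Python model of the TraceMe context-manager protocol."""

  def __init__(self, name, **kwargs):
    # The C++ helper encodes keyword metadata as "name#k1=v1,k2=v2#"; in the real
    # binding the trace starts as soon as the object is constructed.
    if kwargs:
      name += "#" + ",".join("%s=%s" % (k, v) for k, v in kwargs.items()) + "#"
    self._name = name
    self._active = True

  def __enter__(self):
    return self  # __enter__ just returns self, as in the new binding

  def set_metadata(self, **kwargs):
    # Mirrors set_metadata: append more key/value pairs while the scope is open.
    if self._active and kwargs:
      self._name += "#" + ",".join("%s=%s" % (k, v) for k, v in kwargs.items()) + "#"

  def __exit__(self, ex_type, ex_value, traceback):
    self._active = False  # the real binding calls TraceMeWrapper::Stop() here
    return None

with ToyTraceMe("train_step", step=3) as tm:
  tm.set_metadata(phase="forward")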
- .def("GetProgramShape", &XlaComputation::GetProgramShape) - .def("GetSerializedProto", &GetComputationSerializedProto) - .def("GetHloText", &GetComputationHloText) - .def("GetHloDotGraph", &GetComputationHloDotGraph) - .def("Hash", &HashComputation) .def("get_hlo_module", &GetHloModule) .def("program_shape", &XlaComputation::GetProgramShape) .def("as_serialized_hlo_module_proto", &GetComputationSerializedProto) @@ -1513,28 +1095,7 @@ PYBIND11_MODULE(xla_extension, m) { }, "Builds a computation from the contents of the builder.", py::arg("root") = absl::nullopt) - .def("ClearOpMetadata", &XlaBuilder::ClearOpMetadata) .def("GetShape", &XlaBuilder::GetShape) - .def( - "GetProgramShape", - [](const XlaBuilder& builder, - absl::optional root) -> StatusOr { - return root ? builder.GetProgramShape(*root) - : builder.GetProgramShape(); - }, - py::arg("root") = absl::nullopt) - .def("IsConstant", &XlaBuilder::IsConstant) - .def("SetOpMetadata", &XlaBuilder::SetOpMetadata) - .def("SetSharding", &XlaBuilder::SetSharding) - .def("ClearSharding", &XlaBuilder::ClearSharding) - .def("SetUpAlias", - [](XlaBuilder& builder, const std::vector& output_index, - int64 param_number, const std::vector& param_index) { - builder.SetUpAlias( - ShapeIndex(output_index.begin(), output_index.end()), - param_number, - ShapeIndex(param_index.begin(), param_index.end())); - }) .def( "build", [](XlaBuilder& builder, absl::optional root) { @@ -1565,17 +1126,7 @@ PYBIND11_MODULE(xla_extension, m) { ShapeIndex(param_index.begin(), param_index.end())); }); - // TODO(phawkins): delete capitalized names after updating callers - m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); m.def("buffer_to_dlpack_managed_tensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", - [](const py::capsule& tensor, std::shared_ptr client) - -> StatusOr> { - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - DLPackManagedTensorToBuffer(tensor, client.get())); - return WrapWithClient(std::move(client), std::move(buffer)); - }); m.def("dlpack_managed_tensor_to_buffer", [](const py::capsule& tensor, std::shared_ptr client) -> StatusOr> { @@ -1615,6 +1166,7 @@ PYBIND11_MODULE(xla_extension, m) { BuildOpsSubmodule(&m); BuildProfilerSubmodule(&m); + BuildOutfeedReceiverSubmodule(&m); py::class_> diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index d9cd906939d..76c3bc33a91 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -300,13 +300,13 @@ CompileOptions = _xla.CompileOptions # An Executable is a C++ class that duck types with the following API: # class Executable(object): # def local_devices(self) -> [Device]: -# def Execute(self, arguments : [Buffer]) -> Buffer: +# def execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # -# def SizeOfGeneratedCodeInBytes(self) -> int: +# def size_of_generated_code_in_bytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: +# def execute_on_local_devices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. 
# # Args: @@ -329,7 +329,7 @@ def execute_with_python_values(executable, arguments, backend): return backend.buffer_from_pyval(arg, device=executable.local_devices()[0]) arguments = [put(arg) for arg in arguments] - outputs = executable.Execute(arguments) + outputs = executable.execute(arguments) return [x.to_py() for x in outputs] @@ -359,7 +359,7 @@ def execute_with_python_values_replicated(executable, arguments, backend): flat_arg_buffers = flat_arg_buffers[len(replica_args):] return [[x.to_py() for x in xs] - for xs in executable.ExecuteOnLocalDevices(arg_buffers)] + for xs in executable.execute_on_local_devices(arg_buffers)] class PaddingType(enum.Enum): diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 62b3fae018a..000db2cb16b 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -115,6 +115,10 @@ def TestFactory(xla_backend, cloud_tpu=False): """Convenience wrapper to create Numpy arrays with a np.float32 dtype.""" return np.array(*args, dtype=np.float32, **kwargs) + def NumpyArrayF64(*args, **kwargs): + """Convenience wrapper to create Numpy arrays with a np.float64 dtype.""" + return np.array(*args, dtype=np.float64, **kwargs) + def NumpyArrayS32(*args, **kwargs): """Convenience wrapper to create Numpy arrays with a np.int32 dtype.""" return np.array(*args, dtype=np.int32, **kwargs) @@ -882,12 +886,20 @@ def TestFactory(xla_backend, cloud_tpu=False): ops.Abs(ops.Constant(c, arr)) self._ExecuteAndCompareClose(c, expected=[np.abs(arr)]) - def testTanh(self): + def testTanhF32(self): c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) + arr = NumpyArrayF32([-0.2, 3.3, 12.1, 0.1, 0.0001]) ops.Tanh(ops.Constant(c, arr)) self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)]) + def testTanhF64(self): + if self.backend.platform == "tpu": + self.skipTest("TPU doesn't support 64bit tanh") + c = self._NewComputation() + arr = NumpyArrayF64([-0.2, 3.3, 12.1, 0.1, 0.0001]) + ops.Tanh(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)], rtol=1e-12) + def testTranspose(self): def _TransposeAndTest(array, permutation): @@ -2029,8 +2041,11 @@ def TestFactory(xla_backend, cloud_tpu=False): return tests -def InstantiateTests(globals_dict, backend, test_prefix="", **kw): - for klass in TestFactory(backend, **kw): +def InstantiateTests(globals_dict, backend_fn, test_prefix="", **kw): + # Avoid creating a new backend per test (this causes GPU OOM, and is probably + # inefficient). + backend_fn = functools.lru_cache(maxsize=None)(backend_fn) + for klass in TestFactory(backend_fn, **kw): test = type(test_prefix + klass.__name__, (klass,), {}) # Clean up the qualified names of the tests to not include the test factory. 
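The InstantiateTests change above wraps the backend factory in functools.lru_cache so every generated test class reuses a single backend instance instead of constructing one per test (which was causing GPU OOM). A self-contained sketch of the same memoization pattern; make_backend is a hypothetical stand-in for the real backend_fn:

import functools

def make_backend():
  # Pretend this is the expensive per-platform backend constructor.
  print("creating backend")
  return object()

# With maxsize=None the decorated factory acts as a simple memo: the first call
# builds the backend, every later call returns the cached instance.
make_backend = functools.lru_cache(maxsize=None)(make_backend)

backend_a = make_backend()
backend_b = make_backend()
assert backend_a is backend_b  # "creating backend" was printed exactly once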
test.__qualname__ = test.__name__ diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 799d5654840..dfc9aae94e0 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -460,6 +460,97 @@ cc_library( ], ) +cc_library( + name = "hlo_sharding_util", + srcs = [ + "hlo_sharding_util.cc", + ], + hdrs = [ + "hlo_sharding_util.h", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:array", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "hlo_sharding_util_test", + srcs = [ + "hlo_sharding_util_test.cc", + ], + deps = [ + ":hlo_sharding_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "sharding_propagation", + srcs = [ + "sharding_propagation.cc", + ], + hdrs = [ + "sharding_propagation.h", + ], + deps = [ + ":dot_as_convolution_util", + ":hlo", + ":hlo_graph_dumper", + ":hlo_pass", + ":hlo_sharding_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "sharding_propagation_test", + srcs = [ + "sharding_propagation_test.cc", + ], + deps = [ + "hlo_matchers", + ":hlo_parser", + ":sharding_propagation", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "dot_as_convolution_util", + srcs = [ + "dot_as_convolution_util.cc", + ], + hdrs = [ + "dot_as_convolution_util.h", + ], + deps = [ + ":hlo", + ":shape_inference", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/types:optional", + ], +) + tf_cc_test( name = "dynamic_parameter_binding_test", srcs = ["dynamic_parameter_binding_test.cc"], @@ -2098,6 +2189,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", ], ) @@ -2397,6 +2490,42 @@ tf_cc_test( ], ) +cc_library( + name = "all_gather_decomposer", + srcs = ["all_gather_decomposer.cc"], + hdrs = ["all_gather_decomposer.h"], + deps = [ + ":hlo", + ":hlo_casting_utils", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "all_gather_decomposer_test", + srcs = ["all_gather_decomposer_test.cc"], + deps = [ + ":all_gather_decomposer", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", 
+ "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "tuple_simplifier", srcs = ["tuple_simplifier.cc"], @@ -3217,6 +3346,7 @@ cc_library( ":heap_simulator", ":hlo_cost_analysis", "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/core/lib/math:math_util", ], ) @@ -3234,6 +3364,29 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_propagation", + srcs = ["memory_space_propagation.cc"], + hdrs = ["memory_space_propagation.h"], + deps = [ + ":hlo", + ":hlo_dataflow_analysis", + ":hlo_pass", + ], +) + +tf_cc_test( + name = "memory_space_propagation_test", + srcs = ["memory_space_propagation_test.cc"], + deps = [ + ":hlo_parser", + ":memory_space_propagation", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_dce", srcs = ["hlo_dce.cc"], @@ -3787,6 +3940,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm-project//llvm:core", "@llvm-project//llvm:transform_utils", ], @@ -4406,6 +4560,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/base", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 55af8726dc8..4025cb46f18 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -472,8 +472,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { HloInstruction* dot); HloComputation* GetOrCreateScalarAddComputation(PrimitiveType type) { - if (scalar_add_computation_) { - return scalar_add_computation_; + HloComputation*& scalar_add_computation = scalar_add_computations_[type]; + if (scalar_add_computation) { + return scalar_add_computation; } HloComputation::Builder b("scalar_add_computation"); @@ -485,9 +486,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { HloInstruction::CreateParameter(1, shape, "scalar_rhs")); auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs)); - scalar_add_computation_ = + scalar_add_computation = computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); - return scalar_add_computation_; + return scalar_add_computation; } // Tries to fold a kPad in the input or filter into the convolution @@ -508,6 +509,13 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into + // `(< a N)`. This is crucial for being able to figure out the loop trip + // count. + // + // Assumes that the input is conjunction. + StatusOr TrySimplifyTautologicalCompare(HloInstruction* conjunction); + // Useful when we want to use the same visitor over multiple computations. 
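TrySimplifyTautologicalCompare, declared above and implemented in the next hunk, folds a conjunction of two less-than comparisons of the same variable against scalar constants into a single comparison against the smaller bound, which lets later passes recover loop trip counts. A tiny Python model of the rewrite over (variable, bound) pairs rather than HLO instructions; the function name is hypothetical:

def simplify_tautological_compare(lhs, rhs):
  """Models (a < N) and (a < K) -> (a < min(N, K)) when both sides test the same a.

  lhs and rhs are (var_name, bound) pairs standing in for LT compares against
  scalar constants; returns the fused compare, or None if the rewrite does not apply.
  """
  var_l, bound_l = lhs
  var_r, bound_r = rhs
  if var_l != var_r:
    return None  # different variables: the conjunction is not tautological
  return (var_l, min(bound_l, bound_r))

# (param < 10) and (param < 100) collapses to (param < 10), matching the
# CompareSimplified test added later in this change.
assert simplify_tautological_compare(("param", 10), ("param", 100)) == ("param", 10)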
void ResetState(HloComputation* computation); @@ -521,8 +529,8 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Whether algebraic simplification has occurred. bool changed_ = false; - // Cached computation for adding two scalar F32. - HloComputation* scalar_add_computation_ = nullptr; + // Cached computation for adding two scalars of a given type. + absl::flat_hash_map scalar_add_computations_; AlgebraicSimplifier* simplifier_ = nullptr; }; @@ -856,6 +864,50 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( + HloInstruction* conjunction) { + HloInstruction *lhs, *rhs; + if (!Match(conjunction, m::And(m::Op(&lhs), m::Op(&rhs)))) { + return false; + } + struct LessThanCompareInfo { // (LT var constant) + HloInstruction* var; + int64 constant; + }; + + auto get_compare_info = + [&](HloInstruction* cmp) -> absl::optional { + HloInstruction *lhs, *rhs; + auto scalar_shape_matcher = + m::Shape().IsEffectiveScalar().WithElementType(PrimitiveType::S32); + if (Match(cmp, m::Compare(m::Op(&lhs), + m::Constant(&rhs).WithShape(scalar_shape_matcher)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + } else if (Match( + cmp, + m::Compare(m::Constant(&lhs).WithShape(scalar_shape_matcher), + m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kGt))) { + return {LessThanCompareInfo{rhs, *lhs->literal().GetFirstInteger()}}; + } + return absl::nullopt; + }; + + absl::optional lhs_info = get_compare_info(lhs); + absl::optional rhs_info = get_compare_info(rhs); + if (lhs_info && rhs_info && lhs_info->var == rhs_info->var) { + int64 new_bound = std::min(lhs_info->constant, rhs_info->constant); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + conjunction, + HloInstruction::CreateCompare(lhs->shape(), lhs_info->var, + MakeScalarLike(lhs_info->var, new_bound), + ComparisonDirection::kLt))); + return true; + } + return false; +} + Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); @@ -890,6 +942,13 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { return Status::OK(); } + // Simplify tautological conjunctions. 
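The new divide rewrite above turns convert(pred) / broadcast(y) into broadcast(1/y) * convert(pred). Because the converted operand only holds 0.0 and 1.0, multiplying by the precomputed reciprocal produces exactly the same values while replacing a per-element divide with a multiply. A small numpy check of that equivalence with made-up inputs (illustration only, not XLA code):

import numpy as np

pred = np.array([True, False, True, True])
x = pred.astype(np.float32)             # convert(pred): every element is 0.0 or 1.0
y = np.float32(4.0)                     # the scalar that was broadcast as the divisor

original = x / y                        # convert(pred) / broadcast(y)
rewritten = (np.float32(1.0) / y) * x   # broadcast(1/y) * convert(pred)

# Multiplying 0.0 or 1.0 by the reciprocal is exact, so the two forms agree bit-for-bit.
assert np.array_equal(original, rewritten)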
+ TF_ASSIGN_OR_RETURN(bool found_tautological_compare, + TrySimplifyTautologicalCompare(logical_and)); + if (found_tautological_compare) { + return Status::OK(); + } + return Status::OK(); } @@ -1423,6 +1482,22 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) { return ReplaceInstruction(divide, new_divide); } + // If X is a convert from pred, then + // X / broadcast(Y) => broadcast(1/Y) * X + if (Match(divide, + m::Divide( + m::Convert(&a, + m::Op().WithShape(m::Shape().WithElementType(PRED))), + m::Broadcast(m::Op(&b).WithShape(m::Shape().IsScalar()))))) { + TF_ASSIGN_OR_RETURN( + auto recip, MakeBinaryHlo(HloOpcode::kDivide, MakeScalarLike(b, 1), b)); + auto recip_bcast = computation_->AddInstruction( + HloInstruction::CreateBroadcast(divide->shape(), recip, {})); + TF_ASSIGN_OR_RETURN(auto mul, + MakeBinaryHlo(HloOpcode::kMultiply, recip_bcast, a)); + return ReplaceInstruction(divide, mul); + } + return Status::OK(); } @@ -2983,6 +3058,20 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( return false; } HloInstruction* operand = broadcast->mutable_operand(0); + auto is_scalar_broadcast = [](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(instruction->operand(0)->shape()); + }; + auto is_equal_broadcast = [operand, + broadcast](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kBroadcast && + ShapeUtil::Equal(operand->shape(), + instruction->operand(0)->shape()) && + broadcast->dimensions() == instruction->dimensions(); + }; + auto is_compatible_broadcast = [&](const HloInstruction* instruction) { + return is_scalar_broadcast(instruction) || is_equal_broadcast(instruction); + }; for (HloInstruction* user : broadcast->users()) { if (user->user_count() == 0 && user != computation_->root_instruction()) { continue; @@ -3001,18 +3090,20 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( continue; } - // Find the unique non-scalar operand or continue if there isn't one. - int64 scalar_broadcast_count = 0; + // Check if all the operands of the user are compatible broadcasts for + // sinking. 
(They are either scalar broadcasts or broadcasts casting + // from/to the same shape/dimensions) + int64 compatible_broadcast_count = 0; int64 broadcast_use_count = 0; for (HloInstruction* user_operand : user->operands()) { - if (user_operand->opcode() == HloOpcode::kBroadcast && - ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { - ++scalar_broadcast_count; + if (is_compatible_broadcast(user_operand)) { + ++compatible_broadcast_count; } else if (broadcast == user_operand) { ++broadcast_use_count; } } - if (scalar_broadcast_count + broadcast_use_count != user->operand_count()) { + if (compatible_broadcast_count + broadcast_use_count != + user->operand_count()) { continue; } std::vector new_operands; @@ -3020,14 +3111,24 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( Shape changed_shape; for (HloInstruction* user_operand : user->operands()) { - if (user_operand->opcode() == HloOpcode::kBroadcast && - ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { - changed_shape = ShapeUtil::ChangeElementType( - operand->shape(), user_operand->shape().element_type()); - simplifier_->UpdateLayout(&changed_shape); - new_operands.push_back( - computation_->AddInstruction(HloInstruction::CreateBroadcast( - changed_shape, user_operand->mutable_operand(0), {}))); + // If this is a broadcast operand that is not our original broadcast input + // to this function then we might need to change the input. + if (is_compatible_broadcast(user_operand)) { + // If this is a broadcast from a scalar value rewrite a broadcast from + // the scalar to the new shape enforced from the other broadcast + // operands. + if (is_scalar_broadcast(user_operand)) { + changed_shape = ShapeUtil::ChangeElementType( + operand->shape(), user_operand->shape().element_type()); + simplifier_->UpdateLayout(&changed_shape); + new_operands.push_back( + computation_->AddInstruction(HloInstruction::CreateBroadcast( + changed_shape, user_operand->mutable_operand(0), {}))); + } else { + // For the non-scalar broadcasts we guarantee that the shape of the + // operand of the broadcast needs to be already a compatible shape. 
+ new_operands.push_back(user_operand->mutable_operand(0)); + } } else { CHECK_EQ(broadcast, user_operand); new_operands.push_back(operand); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 6c8e80aa963..bcfc2fdc740 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -338,6 +338,79 @@ TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMergeBroadcastedConstants) { m::ConstantScalar(3.0)))))); } +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsScalar) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + b0 = f32[4] broadcast(p0), dimensions={} + b1 = f32[4] broadcast(p1), dimensions={} + ROOT multiply = f32[4] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply(m::Broadcast(m::Parameter(1)), + m::Broadcast(m::Parameter(0)))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsConstantMix) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + c0 = f32[] constant(2.0) + b0 = f32[4,2] broadcast(c0), dimensions={} + b1 = f32[4,2] broadcast(p0), dimensions={0} + ROOT multiply = f32[4,2] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply( + m::Parameter(0), m::Broadcast(m::ConstantScalar(2.0)))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsNonScalar) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[4] parameter(1) + b0 = f32[4,2] broadcast(p0), dimensions={0} + b1 = f32[4,2] broadcast(p1), dimensions={0} + ROOT multiply = f32[4,2] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply(m::Parameter(1), m::Parameter(0))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseNoSinkBroadcastsDifferentDims) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[8] parameter(1) + b0 = f32[4,8] broadcast(p0), dimensions={0} + b1 = f32[4,8] broadcast(p1), dimensions={1} + ROOT multiply = f32[4,8] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Multiply(m::Broadcast(m::Parameter(1)), + m::Broadcast(m::Parameter(0))))); +} + TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMultiplyOfConstantAndBroadcast) { const char* kModuleStr = R"( @@ -5761,6 +5834,44 @@ TEST_F(AlgebraicSimplifierTest, CompareSame) { GmockMatch(m::Broadcast(m::ConstantScalar(true)))); } +TEST_F(AlgebraicSimplifierTest, CompareSimplified) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] 
constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(param, c2), direction=LT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + +TEST_F(AlgebraicSimplifierTest, CompareSimplifiedReversed) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(c2, param), direction=GT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply @@ -6462,5 +6573,43 @@ TEST_F(AlgebraicSimplifierTest, SwapConvOperands) { EXPECT_EQ(conv->window().dimensions(1).padding_high(), 1); } +TEST_F(AlgebraicSimplifierTest, ScalarDividePredicate) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = pred[2] parameter(0) + cvt = f32[2] convert(p0) + p1 = f32[] parameter(1) + bcast = f32[2] broadcast(p1), dimensions={} + ROOT div = f32[2] divide(cvt, bcast) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::Convert(m::Parameter(0)), + m::Broadcast(m::Divide(m::ConstantScalar(1), m::Parameter(1)))))); +} + +TEST_F(AlgebraicSimplifierTest, MultipleDotStrengthReductions) { + constexpr char kModuleStr[] = R"( + HloModule test + ENTRY test { + a = c64[2,2] parameter(0) + b = c64[2] parameter(1) + cd = c64[2] dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0} + c = f64[2,2] parameter(2) + d = f64[2] parameter(3) + dd = f64[2] dot(c, d), lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT tuple = (c64[2], f64[2]) tuple(cd, dd) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_EQ(3, m->computation_count()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.cc b/tensorflow/compiler/xla/service/all_gather_decomposer.cc new file mode 100644 index 00000000000..00b9adaea43 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.cc @@ -0,0 +1,157 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +// Creates a computation of x + y. +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +Status DecomposeAllGather(HloAllGatherInstruction* ag, HloComputation* comp) { + const int64 shard_size = + ag->operand(0)->shape().dimensions(ag->all_gather_dimension()); + const int64 ag_size = ag->shape().dimensions(ag->all_gather_dimension()); + TF_RET_CHECK(ag_size % shard_size == 0); + int64 partition_count = ag_size / shard_size; + auto zero = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(ag->shape().element_type()))); + zero = comp->AddInstruction( + HloInstruction::CreateBroadcast(ag->shape(), zero, {})); + auto zero_index = comp->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(U32))); + std::vector start_indices(ag->shape().rank(), zero_index); + auto shard_id_from_subgroup = [&](HloInstruction* replica_or_global_id) { + if (ag->replica_groups().empty()) { + return replica_or_global_id; + } + if (ag->replica_groups().size() == 1) { + // Whether the group is {1, 2, ..., N - 1}. + bool trivial_group = true; + for (int64 i = 0; i < ag->replica_groups()[0].replica_ids_size(); ++i) { + if (ag->replica_groups()[0].replica_ids(i) != i) { + trivial_group = false; + break; + } + } + if (trivial_group) { + CHECK_EQ(partition_count, ag->replica_groups()[0].replica_ids_size()); + return replica_or_global_id; + } + } + // Create a table of shard IDs for each replica_or_global_id, then slice it + // using replica_or_global_id. 
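DecomposeAllGather above (the function body continues below) lowers an all-gather into three steps: broadcast a zero buffer with the full output shape, dynamic-update-slice the local shard in at offset shard_id * shard_size along the gather dimension, and all-reduce the results with a binary add (or OR for PRED) so every participant ends up with the concatenated output. A numpy model of that data flow for the simple cross-replica case, with made-up shard contents; this illustrates the arithmetic only and is not XLA code:

import numpy as np

def decomposed_all_gather(shards, gather_dim=0):
  """Zero-pad each shard into the full shape at its offset, then all-reduce with add."""
  shard_size = shards[0].shape[gather_dim]
  full_shape = list(shards[0].shape)
  full_shape[gather_dim] = shard_size * len(shards)
  padded = []
  for shard_id, shard in enumerate(shards):
    buf = np.zeros(full_shape, dtype=shard.dtype)         # broadcast(0)
    index = [slice(None)] * len(full_shape)
    index[gather_dim] = slice(shard_id * shard_size,       # shard_id * shard_size
                              shard_id * shard_size + shard_size)
    buf[tuple(index)] = shard                              # dynamic-update-slice
    padded.append(buf)
  return np.sum(padded, axis=0)                            # all-reduce (add)

shards = [np.full((2, 3), replica, dtype=np.float32) for replica in range(4)]
assert np.array_equal(decomposed_all_gather(shards, gather_dim=0),
                      np.concatenate(shards, axis=0))      # what all-gather produces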
+ std::vector shard_ids(ag->replica_groups().size() * + ag->replica_groups()[0].replica_ids_size()); + for (const auto& group : ag->replica_groups()) { + for (int64 i = 0; i < group.replica_ids_size(); ++i) { + shard_ids[group.replica_ids(i)] = i; + } + } + auto id_table = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(shard_ids))); + auto shard_id = comp->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(U32, {1}), id_table, {replica_or_global_id}, {1})); + shard_id = comp->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(U32, {}), shard_id)); + return shard_id; + }; + HloInstruction* shard_id; + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + auto pid = comp->AddInstruction(HloInstruction::CreatePartitionId()); + auto rid = comp->AddInstruction(HloInstruction::CreateReplicaId()); + auto pcount = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(partition_count))); + auto global_id = comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kAdd, pid, + comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kMultiply, rid, pcount)))); + shard_id = shard_id_from_subgroup(global_id); + } else { + TF_RET_CHECK(!ag->replica_groups().empty()); + TF_RET_CHECK(ag->replica_groups()[0].replica_ids_size() == 1); + shard_id = comp->AddInstruction(HloInstruction::CreatePartitionId()); + } + } else { + shard_id = shard_id_from_subgroup( + comp->AddInstruction(HloInstruction::CreateReplicaId())); + } + start_indices[ag->all_gather_dimension()] = + comp->AddInstruction(HloInstruction::CreateBinary( + shard_id->shape(), HloOpcode::kMultiply, shard_id, + comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(shard_size))))); + auto dus = comp->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + zero->shape(), zero, ag->mutable_operand(0), start_indices)); + auto ar = comp->AddInstruction(HloInstruction::CreateAllReduce( + dus->shape(), {dus}, + MakeBinaryAdd(dus->shape().element_type(), comp->parent()), + ag->replica_groups(), + /*constrain_layout=*/ag->constrain_layout(), ag->channel_id(), + ag->use_global_device_ids())); + TF_RETURN_IF_ERROR(ag->ReplaceAllUsesWith(ar)); + TF_RETURN_IF_ERROR(comp->RemoveInstructionAndUnusedOperands(ag)); + return Status::OK(); +} + +StatusOr AllGatherDecomposer::Run(HloModule* module) { + bool changed = false; + for (auto comp : module->MakeNonfusionComputations()) { + for (auto hlo : comp->MakeInstructionPostOrder()) { + if (hlo->opcode() != HloOpcode::kAllGather) { + continue; + } + auto ag = Cast(hlo); + if (should_decompose_(*ag)) { + TF_RETURN_IF_ERROR(DecomposeAllGather(ag, comp)); + changed = true; + } + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.h b/tensorflow/compiler/xla/service/all_gather_decomposer.h new file mode 100644 index 00000000000..6b20765c709 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ + +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// AllGatherDecomposer is a pass which converts unsupported all-gathers into +// dynamic-update-slices and all-reduces. +class AllGatherDecomposer : public HloModulePass { + public: + explicit AllGatherDecomposer( + std::function should_decompose) + : should_decompose_(std::move(should_decompose)) {} + AllGatherDecomposer() + : should_decompose_( + [](const HloAllGatherInstruction& ag) { return true; }) {} + absl::string_view name() const override { return "all_gather_decomposer"; } + + // Run AllGatherDecomposer pass on computations in 'module'. + // Returns whether the 'module' was changed. + StatusOr Run(HloModule* module) override; + + private: + std::function should_decompose_; + int64 partition_count_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc new file mode 100644 index 00000000000..3df5e51a7c2 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc @@ -0,0 +1,160 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; +using AllGatherDecomposerTest = HloTestBase; + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossPartitionAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0}}, channel_id=1, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::PartitionId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithTrivialGroup) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0,1,2,3}}, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroups) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + 
TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + auto id = + AllOf(op::Shape("u32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), op::ReplicaId()))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroupsGlobalIds) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1}, channel_id=1, + use_global_device_ids=true +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer; + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + auto global_id = + op::Add(op::PartitionId(), op::Multiply(op::ReplicaId(), op::Constant())); + auto id = AllOf(op::Shape("u32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), global_id))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc index abb695fa486..30d764225c2 100644 --- a/tensorflow/compiler/xla/service/bfloat16_support.cc +++ b/tensorflow/compiler/xla/service/bfloat16_support.cc @@ -79,6 +79,7 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision( const HloInstruction& hlo, int64 operand_index) { switch (hlo.opcode()) { case HloOpcode::kAbs: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kBroadcast: case HloOpcode::kClamp: diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 67cdb081a91..6cd58b86f0c 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -261,7 +261,7 @@ void BufferAllocation::AddAssignment(const HloValue& buffer, int64 offset, Shape* shape = ShapeUtil::GetMutableSubshape( position.instruction->mutable_shape(), position.index); if (shape->has_layout()) { - shape->mutable_layout()->set_memory_space(buffer.color().value()); + shape->mutable_layout()->set_memory_space(buffer.color()); } } } @@ -272,7 +272,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { proto.set_size(size_); proto.set_is_thread_local(is_thread_local_); proto.set_is_tuple(is_tuple_); - proto.set_color(color_.value()); + proto.set_color(color_); if (is_entry_computation_parameter_) { proto.set_is_entry_computation_parameter(true); for (int64 idx : param_shape_index()) { @@ -336,8 +336,8 @@ static const HloInstruction* GetOutputInstruction( string BufferAllocation::ToString() const { string output; StrAppendFormat(&output, "allocation %d: %p, size %d", index_, this, size()); - if (color().value() != 0) { - StrAppend(&output, ", color ", color().value()); + if (color() != 0) { + StrAppend(&output, ", color ", color()); } if (is_entry_computation_parameter()) { const HloInstruction* param = GetEntryParameterInstruction(*this); @@ -607,9 +607,7 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation, // 
BufferAllocation. void BufferAssignment::CombineTempAllocations() { VLOG(1) << "CombineTempAllocations()"; - flat_hash_map - combined_allocation_map; + flat_hash_map combined_allocation_map; // Move all temp allocations into a single run at the end of the allocations // vector. @@ -1059,8 +1057,8 @@ Status BufferAssigner::MergeInplaceOpBuffers(BufferAssignment* assignment) { // The instruction or operand color is excluded because it was assigned by // memory_space_assignment. - if (excluded_colors.contains(instruction_buffer.color().value()) || - excluded_colors.contains(operand_buffer.color().value())) { + if (excluded_colors.contains(instruction_buffer.color()) || + excluded_colors.contains(operand_buffer.color())) { continue; } @@ -1353,13 +1351,10 @@ Status BufferAssigner::AssignBuffersForComputations( return Status::OK(); } -flat_hash_map, - LogicalBuffer::Color::Hasher> +flat_hash_map> BufferAssigner::SplitBuffersByColor( const flat_hash_set& buffers) { - flat_hash_map, - LogicalBuffer::Color::Hasher> - color_map; + flat_hash_map> color_map; for (auto buffer : buffers) { color_map[buffer->color()].insert(buffer); } @@ -1374,8 +1369,7 @@ Status BufferAssigner::AssignPresetBuffers( } // Create an allocation for each preset color. - absl::flat_hash_map + absl::flat_hash_map preset_allocations; for (auto& color_and_info : preset_assignments_->assignment_informations()) { LogicalBuffer::Color color(color_and_info.first); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 2a02d3776ce..50a4750601b 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -673,8 +673,7 @@ class BufferAssigner { // Split a set of buffers into several sets, each of which contains buffers // colored with the same color. absl::flat_hash_map, - LogicalBuffer::Color::Hasher> + absl::flat_hash_set> SplitBuffersByColor(const absl::flat_hash_set& buffers); // If true, allocate buffers for constant instructions. diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc index b1abba20689..58e8086f5e9 100644 --- a/tensorflow/compiler/xla/service/buffer_value.cc +++ b/tensorflow/compiler/xla/service/buffer_value.cc @@ -59,7 +59,7 @@ LogicalBufferProto BufferValue::ToProto(const SizeFunction& size_fn) const { ToLocationProto(*instruction(), index()); proto.mutable_defined_at()->Swap(&proto_location); if (has_color()) { - proto.set_color(color().value()); + proto.set_color(color()); } return proto; } diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h index 44cd7b5ebbd..bd2a09e4aaf 100644 --- a/tensorflow/compiler/xla/service/buffer_value.h +++ b/tensorflow/compiler/xla/service/buffer_value.h @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -86,7 +85,7 @@ namespace xla { class BufferValue { public: - TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + using Color = int64; // Id is a unique identifier for the BufferValue to facilitate efficient // collections of BufferValues with stable iteration order. 
@@ -154,7 +153,7 @@ class BufferValue { static LogicalBufferProto::Location ToLocationProto( const HloInstruction& instruction, const ShapeIndex& index); - const Color kInvalidColor = Color(-1); + const Color kInvalidColor = -1; protected: BufferValue(HloInstruction* instruction, const ShapeIndex& index, Id id); diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 8c76e912011..ce9c8a4ea62 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -91,6 +91,7 @@ CompileOnlyService::CompileAheadOfTime( TF_RETURN_IF_ERROR(options.static_device_assignment().Serialize( execution_options.mutable_device_assignment())); } + execution_options.set_use_spmd_partitioning(options.use_spmd_partitioning()); for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_host_program_shape()); *execution_options.mutable_shape_with_output_layout() = diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index cf646159a38..57b24e372e6 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -76,6 +76,7 @@ class AotCompilationOptions { virtual int64 replica_count() const { return 0; } virtual int64 num_cores() const { return 0; } + virtual bool use_spmd_partitioning() const { return false; } // Optional allocator that may be used for allocating temp space on the device // during compilation. diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index f60742a8c23..a606e44c5ef 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -16,11 +16,14 @@ limitations under the License. #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include +#include #include #include #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/call_graph.h" @@ -163,94 +166,92 @@ StatusOr TryRemoveConditional(HloInstruction* conditional) { return true; } StatusOr TryRemoveUnusedConditionalOperands( - HloInstruction* conditional, - std::map>* changed_computations) { - // Avoid dealing with sharding. - if (conditional->has_sharding()) { + HloComputation* computation, + const absl::flat_hash_set& calling_conditionals) { + HloInstruction* param = computation->parameter_instruction(0); + // Do not remove from the root instruction. + if (param == computation->root_instruction()) { return false; } - std::vector> tuple_indices_to_keep( - conditional->branch_count()); - bool will_change = false; - for (int64 i = 0; i < conditional->branch_count(); ++i) { - HloComputation* computation = conditional->branch_computation(i); - if (changed_computations->count(computation) > 0) { - will_change = true; - break; - } - HloInstruction* param = computation->parameter_instruction(0); - // Do not remove the root instruction. - if (param == computation->root_instruction()) { - return false; - } - // There is nothing to be removed for non-tuple operands. 
- if (!param->shape().IsTuple()) { - return false; - } - for (HloInstruction* user : param->users()) { - // If the user is not a get tuple element, assume it is unsafe to remove - // elements from the tuple. - if (user->opcode() != HloOpcode::kGetTupleElement) { - return false; - } - tuple_indices_to_keep[i].insert(user->tuple_index()); - } - // If not all tuple elements are used in this conditional branch, some can - // removed from the computation. - if (tuple_indices_to_keep[i].size() != - ShapeUtil::TupleElementCount(param->shape())) { - will_change = true; - } + // There is nothing to be removed for non-tuple operands. + if (!param->shape().IsTuple()) { + return false; } - - if (!will_change) { + std::set tuple_indices_to_keep; + for (HloInstruction* user : param->users()) { + // If the user is not a get tuple element, assume it is unsafe to remove + // elements from the tuple. + if (user->opcode() != HloOpcode::kGetTupleElement) { + return false; + } + tuple_indices_to_keep.insert(user->tuple_index()); + } + // If all tuple elements are used in this conditional branch, there is nothing + // to be removed. + int64 old_tuple_element_count = ShapeUtil::TupleElementCount(param->shape()); + if (tuple_indices_to_keep.size() == old_tuple_element_count) { return false; } - for (int64 branch = 0; branch < conditional->branch_count(); ++branch) { - const Shape& old_shape = conditional->operand(branch + 1)->shape(); - int64 old_tuple_element_count = ShapeUtil::TupleElementCount(old_shape); - // Clone the computation in case it is called by another instruction. - HloComputation* computation = conditional->branch_computation(branch); - if (changed_computations - ->insert({computation, tuple_indices_to_keep[branch]}) - .second) { - HloInstruction* param = computation->parameter_instruction(0); + // Create a new tuple shape based on the indices actually used by this + // computation branch. + std::vector new_tuple_shapes; + new_tuple_shapes.reserve(tuple_indices_to_keep.size()); + std::vector map(old_tuple_element_count, -1); + for (int64 i : tuple_indices_to_keep) { + map[i] = new_tuple_shapes.size(); + new_tuple_shapes.push_back(param->shape().tuple_shapes(i)); + } + Shape tuple_shape = ShapeUtil::MakeTupleShape(new_tuple_shapes); + // Clone the computation in case it is called by another non-conditional + // instruction. + HloComputation* new_computation = + computation->parent()->AddEmbeddedComputation(computation->Clone()); + param = new_computation->parameter_instruction(0); + // Reset the parameter shape of the computation. + *param->mutable_shape() = tuple_shape; - // Create a new tuple shape based on the indices actually used by this - // branch. - std::vector new_tuple_shapes; - new_tuple_shapes.reserve(tuple_indices_to_keep[branch].size()); - std::vector map(old_tuple_element_count, -1); - for (int64 i : tuple_indices_to_keep[branch]) { - map[i] = new_tuple_shapes.size(); - new_tuple_shapes.push_back(old_shape.tuple_shapes(i)); - } - Shape tuple_shape = ShapeUtil::MakeTupleShape(new_tuple_shapes); - // Reset the parameter shape of the computation. - *param->mutable_shape() = tuple_shape; + // Reroute the GTE instructions to new tuple indices. + for (HloInstruction* user : param->users()) { + user->set_tuple_index(map[user->tuple_index()]); + } - // Reroute the GTE instructions to new tuple indices. - for (HloInstruction* user : param->users()) { - user->set_tuple_index(map[user->tuple_index()]); - } + // Adjust the operand shape of all calling conditionals. 
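The rewritten TryRemoveUnusedConditionalOperands above keeps only the tuple indices that some get-tuple-element user actually reads, builds a smaller parameter shape, and records an old-to-new index map so the GTEs and every calling conditional's operand tuple can be rerouted (as the following hunk does). A compact Python model of that pruning and remapping step, using plain lists in place of HLO shapes; the names are hypothetical:

def prune_tuple_operand(element_shapes, used_indices):
  """Drops unused tuple elements and remaps get-tuple-element indices.

  element_shapes: per-element shape strings standing in for the old tuple shape.
  used_indices:   set of tuple indices read by get-tuple-element users.
  Returns (new_shapes, old_to_new), where old_to_new[i] == -1 for dropped elements.
  """
  old_to_new = [-1] * len(element_shapes)
  new_shapes = []
  for i in sorted(used_indices):
    old_to_new[i] = len(new_shapes)  # same trick as map[i] = new_tuple_shapes.size()
    new_shapes.append(element_shapes[i])
  return new_shapes, old_to_new

# Four operands but only indices 0 and 2 are read, as in the simplifier test below:
# the parameter tuple shrinks to two elements and GTE users are retargeted.
shapes = ["f32[20,40]", "f32[40,40]", "f32[20,40]", "f32[40,40]"]
new_shapes, old_to_new = prune_tuple_operand(shapes, used_indices={0, 2})
assert new_shapes == ["f32[20,40]", "f32[20,40]"]
assert old_to_new == [0, -1, 1, -1]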
+ for (HloInstruction* conditional : calling_conditionals) { + // Avoid dealing with sharding. + if (conditional->has_sharding()) { + continue; } + for (int64 branch = 0; branch < conditional->branch_count(); ++branch) { + if (conditional->branch_computation(branch) != computation) { + continue; + } + conditional->set_branch_computation(branch, new_computation); + const Shape& old_shape = conditional->operand(branch + 1)->shape(); - // Reroute the operand tuple through a tuple of gte instructions of the - // original operand tuple. - const auto& to_keep = (*changed_computations)[computation]; - std::vector new_tuple_operands; - new_tuple_operands.reserve(to_keep.size()); - for (int64 i : to_keep) { - new_tuple_operands.push_back(conditional->parent()->AddInstruction( - HloInstruction::CreateGetTupleElement( - old_shape.tuple_shapes(i), - conditional->mutable_operand(branch + 1), i))); + // Reroute the operand tuple through a tuple of gte instructions of the + // original operand tuple. + std::vector new_tuple_operands; + new_tuple_operands.reserve(tuple_indices_to_keep.size()); + for (int64 i : tuple_indices_to_keep) { + new_tuple_operands.push_back(conditional->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + old_shape.tuple_shapes(i), + conditional->mutable_operand(branch + 1), i))); + } + HloInstruction* new_tuple = conditional->parent()->AddInstruction( + HloInstruction::CreateTuple(new_tuple_operands)); + TF_RETURN_IF_ERROR( + conditional->ReplaceOperandWithDifferentShape(branch + 1, new_tuple)); + CHECK(ShapeUtil::Compatible(conditional->operand(branch + 1)->shape(), + conditional->branch_computation(branch) + ->parameter_instruction(0) + ->shape())); + CHECK(ShapeUtil::Compatible( + conditional->shape(), + conditional->branch_computation(branch)->root_instruction()->shape())) + << conditional->branch_computation(branch)->ToString(); } - HloInstruction* new_tuple = conditional->parent()->AddInstruction( - HloInstruction::CreateTuple(new_tuple_operands)); - TF_RETURN_IF_ERROR( - conditional->ReplaceOperandWithDifferentShape(branch + 1, new_tuple)); } return true; } @@ -333,7 +334,7 @@ bool RemoveUnusedTupleElements(HloInstruction* conditional_op) { } // Compute old-to-new (old-to-new) indices mapping. 
-  std::map<int64, int64> new_to_old_mapping, old_to_new_mapping;
+  absl::flat_hash_map<int64, int64> new_to_old_mapping, old_to_new_mapping;
   auto old_iter = used_indices.begin();
   for (int new_index = 0; new_index < new_tuple_shapes_size; ++new_index) {
     old_iter = std::find(old_iter, used_indices.end(), true);
@@ -519,7 +520,8 @@ bool MergeDuplicateTupleElements(HloInstruction* conditional) {
   };

   bool changed = false;
-  std::map<std::vector<const HloInstruction*>, int64> index_collision_table;
+  absl::flat_hash_map<std::vector<const HloInstruction*>, int64>
+      index_collision_table;
   for (int i = 0; i < conditional->shape().tuple_shapes_size(); ++i) {
     const std::vector<const HloInstruction*> ith_operands_vector =
         vectorize_branches_root_tuple_ith_operand(i);
@@ -551,16 +553,34 @@ StatusOr<bool> ConditionalSimplifier::Run(HloModule* module) {
     }
   }

-  std::map<HloComputation*, std::set<int64>> changed_computations;
+  absl::flat_hash_set<HloInstruction*> removed_conditionals;
   for (HloInstruction* conditional_op : conditional_ops) {
     changed |= MergeDuplicateTupleElements(conditional_op);
     changed |= RemoveUnusedTupleElements(conditional_op);
     changed |= ReplaceRootWithEmptyTupleIfNoUsers(conditional_op);
     TF_ASSIGN_OR_RETURN(bool result, TryRemoveConditional(conditional_op));
-    if (!result) {
-      TF_ASSIGN_OR_RETURN(result, TryRemoveUnusedConditionalOperands(
-                                       conditional_op, &changed_computations));
+    if (result) {
+      removed_conditionals.insert(conditional_op);
+      changed = true;
     }
+  }
+  // Try to remove unused conditional operands from branch computations. We need
+  // to be careful to adjust *all* calling conditional ops if we do that, so
+  // lets collect them first.
+  absl::flat_hash_map<HloComputation*, absl::flat_hash_set<HloInstruction*>>
+      calling_conditionals;
+  for (HloInstruction* conditional : conditional_ops) {
+    if (removed_conditionals.contains(conditional)) {
+      continue;
+    }
+    for (int64 branch = 0; branch < conditional->branch_count(); ++branch) {
+      calling_conditionals[conditional->branch_computation(branch)].insert(
+          conditional);
+    }
+  }
+  for (const auto& entry : calling_conditionals) {
+    TF_ASSIGN_OR_RETURN(bool result, TryRemoveUnusedConditionalOperands(
+                                         entry.first, entry.second));
     changed |= result;
   }
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
index 8a7fba6a48f..ea3101fa0ed 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
@@ -198,32 +198,26 @@ ENTRY main {
   c1_1 = f32[40,40] parameter(3)
   p = pred[] parameter(4)
   t = (f32[20,40], f32[40,40], f32[20,40], f32[40,40]) tuple(c0_0, c0_1, c1_0, c1_1)
-  ROOT result = (f32[20, 40]) conditional(p,t,t), false_computation=on_false, true_computation=on_true
+  call = (f32[20,40]) call(t), to_apply=on_true
+  ROOT result = (f32[20,40]) conditional(p,t,t), false_computation=on_false, true_computation=on_true
 }
 )";
   auto status = ParseAndReturnVerifiedModule(hlo_string);
   TF_ASSERT_OK(status.status());
+  std::unique_ptr<VerifiedHloModule> module = status.ConsumeValueOrDie();
   HloVerifier v(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false);
-  TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status());
-  EXPECT_TRUE(
-      ConditionalSimplifier().Run(status.ValueOrDie().get()).ValueOrDie());
-  TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status());
-  EXPECT_EQ(status.ValueOrDie()
-                ->entry_computation()
-                ->root_instruction()
-                ->operand(1)
-                ->shape()
-                .tuple_shapes()
-                .size(),
-            2);
-  EXPECT_EQ(status.ValueOrDie()
-                ->entry_computation()
-                ->root_instruction()
-                ->operand(2)
-                ->shape()
-                .tuple_shapes()
-                .size(),
-            2);
+  TF_ASSERT_OK(v.Run(module.get()).status());
+  EXPECT_TRUE(ConditionalSimplifier().Run(module.get()).ValueOrDie());
+  TF_ASSERT_OK(v.Run(module.get()).status());
+  HloInstruction* conditional = module->entry_computation()->root_instruction();
+  EXPECT_TRUE(conditional != nullptr);
+  EXPECT_EQ(conditional->operand(1)->shape().tuple_shapes().size(), 2);
+  EXPECT_EQ(conditional->operand(2)->shape().tuple_shapes().size(), 2);
+  // For the call operation, nothing should have changed.
+  HloInstruction* call = FindInstruction(module.get(), "call");
+  EXPECT_EQ(
+      call->to_apply()->parameter_instruction(0)->shape().tuple_shapes().size(),
+      4);
 }

 TEST_F(ConditionalSimplifierTest,
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 121bdedf2dd..3460e65b0a2 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -118,6 +118,9 @@ cc_library(
         ":target_machine_features",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/types:span",
+        "@llvm-project//mlir:AllPassesAndDialectsNoRegistration",
+        "@llvm-project//mlir:ExecutionEngineUtils",
+        "@llvm-project//mlir:LLVMDialect",
         "//tensorflow/compiler/xla/service:copy_insertion",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:dump",
@@ -146,6 +149,7 @@ cc_library(
        "//tensorflow/compiler/xla/service:conditional_simplifier",
        "//tensorflow/compiler/xla/service:convolution_group_converter",
        "//tensorflow/compiler/xla/service:dot_decomposer",
+       "//tensorflow/compiler/xla/service:dynamic_padder",
        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
        "//tensorflow/compiler/xla/service:executable",
        "//tensorflow/compiler/xla/service:flatten_call_graph",
@@ -365,6 +369,7 @@ cc_library(
         "@llvm-project//llvm:core",
         "@llvm-project//llvm:support",
         "@llvm-project//llvm:target",
+        "@llvm-project//mlir:IR",
     ],
 )
@@ -455,6 +460,7 @@ cc_library(
         ":cpu_options",
         ":cpu_runtime",
         ":ir_emission_utils",
+        ":mlir_emitter",
         ":target_machine_features",
         ":tiled_dot_emitter",
         ":vector_support_library",
@@ -473,6 +479,10 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:core",
+        "@llvm-project//mlir:EDSC",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:LinalgOps",
+        "@llvm-project//mlir:StandardOps",
     ],
 )
@@ -1069,3 +1079,24 @@ tf_cc_test(
         "@llvm-project//llvm:target",
     ],
 )
+
+cc_library(
+    name = "mlir_emitter",
+    srcs = ["mlir_emitter.cc"],
+    hdrs = ["mlir_emitter.h"],
+    deps = [
+        "//tensorflow/compiler/mlir/xla:hlo_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "@llvm-project//llvm:core",
+        "@llvm-project//llvm:ipo",
+        "@llvm-project//llvm:linker",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:LLVMTransforms",
+        "@llvm-project//mlir:LinalgToLLVM",
+        "@llvm-project//mlir:LinalgTransforms",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TargetLLVMIR",
+        "@llvm-project//mlir:VectorToLLVM",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index b04237138e8..b2416ac2799 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -42,6 +42,8 @@ limitations under the License.
#include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/map_util.h" @@ -72,6 +74,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" +#include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -157,6 +160,8 @@ CpuCompiler::CpuCompiler() { // Initialize LLVM's MC layer for the native target. llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + + mlir::registerAllDialects(); } namespace { @@ -239,7 +244,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( HloPassPipeline pipeline("HLO passes through layout assignment"); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -273,6 +277,13 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); + pipeline.AddPass(); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(target_machine_features); { auto& pass = @@ -281,12 +292,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*allow_mixed_precision=*/false); pass.AddPass(); - pass.AddPass(); - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); AlgebraicSimplifierOptions options; options.set_enable_dot_strength_reduction(false); pass.AddPass(options); @@ -605,9 +610,11 @@ StatusOr> CpuCompiler::RunBackend( user_post_optimization_hook_); // Compile must be thread-safe so create a new LLVM context for the module. - auto llvm_context = absl::make_unique(); - auto llvm_module = - absl::make_unique("__compute_module", *llvm_context); + mlir::MLIRContext mlir_context; + auto llvm_module = absl::make_unique( + "__compute_module", + mlir_context.getRegisteredDialect() + ->getLLVMContext()); auto jit = absl::make_unique( CompilerTargetOptions(module->config()), @@ -661,7 +668,7 @@ StatusOr> CpuCompiler::RunBackend( // before a caller computation. LLVMTargetMachineFeatures target_machine_features(jit->target_machine()); - IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), + IrEmitter ir_emitter(&mlir_context, *module, *assignment, llvm_module.get(), std::move(instruction_to_profile_idx), std::move(computation_to_profile_idx), &target_machine_features, @@ -815,8 +822,11 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, opt_level)); // Compile must be thread-safe so create a new LLVM context for the module. 
- llvm::LLVMContext llvm_context; - llvm::Module llvm_module("__compute_module", llvm_context); + mlir::MLIRContext mlir_context; + llvm::Module llvm_module( + "__compute_module", + mlir_context.getRegisteredDialect() + ->getLLVMContext()); llvm_module.setDataLayout(target_machine->createDataLayout()); llvm_module.setTargetTriple(triple.getTriple()); if (pic_level != llvm::PICLevel::NotPIC) { @@ -865,7 +875,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, } LLVMTargetMachineFeatures target_machine_features(target_machine.get()); - IrEmitter ir_emitter(*module, *assignment, &llvm_module, + IrEmitter ir_emitter(&mlir_context, *module, *assignment, &llvm_module, std::move(instruction_to_profile_idx), std::move(computation_to_profile_idx), &target_machine_features, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 8c1ae0179c0..f031daecb1f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -363,7 +363,12 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( if (shape.IsOpaque()) { return sizeof(void*); } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + if (shape.is_static() || shape.IsTuple()) { + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + } + // Each dynamic dimension size is represented as a S32. + int64 metadata_size = sizeof(int32) * shape.dimensions_size(); + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size; } const InstructionValueSet& CpuExecutable::GetRootValueSet() const { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index ff654c83d61..c0222010fd9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -25,6 +25,7 @@ const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; const char* const kXlaForceEnableExperimentalLlvmIrGemm = "xla_force_enable_experimental_llvm_ir_gemm"; +const char* const kXlaUseLinalgForDot = "xla_use_linalg_for_dot"; const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size"; } // namespace @@ -63,6 +64,12 @@ bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config) { return extra_options_map.count(kXlaForceEnableExperimentalLlvmIrGemm) > 0; } +bool UseLinalgForDot(const HloModuleConfig& config) { + const auto& extra_options_map = + config.debug_options().xla_backend_extra_options(); + return extra_options_map.count(kXlaUseLinalgForDot) > 0; +} + static absl::string_view RemoveSuffix(absl::string_view str, absl::string_view suffix) { CHECK_GE(str.size(), suffix.size()); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index 99e6702d14a..5d25aef6912 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -27,6 +27,7 @@ namespace options { bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config); +bool UseLinalgForDot(const HloModuleConfig& config); absl::optional LlvmIrGemvTilingFactor(const HloModuleConfig& config); absl::optional> LlvmIrGemmTileSize( const HloModuleConfig& config); diff --git 
a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index bd949aa24c7..7abf5da0b64 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -67,6 +67,10 @@ extern const char* const kEigenMatMulF32SymbolName = "__xla_cpu_runtime_EigenMatMulF32"; extern const char* const kEigenMatMulF64SymbolName = "__xla_cpu_runtime_EigenMatMulF64"; +extern const char* const kEigenMatMulC64SymbolName = + "__xla_cpu_runtime_EigenMatMulC64"; +extern const char* const kEigenMatMulC128SymbolName = + "__xla_cpu_runtime_EigenMatMulC128"; extern const char* const kEigenMatMulS32SymbolName = "__xla_cpu_runtime_EigenMatMulS32"; extern const char* const kMKLConvF32SymbolName = "__xla_cpu_runtime_MKLConvF32"; @@ -91,6 +95,10 @@ extern const char* const kEigenSingleThreadedMatMulF32SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF32"; extern const char* const kEigenSingleThreadedMatMulF64SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF64"; +extern const char* const kEigenSingleThreadedMatMulC64SymbolName = + "__xla_cpu_runtime_EigenSingleThreadedMatMulC64"; +extern const char* const kEigenSingleThreadedMatMulC128SymbolName = + "__xla_cpu_runtime_EigenSingleThreadedMatMulC128"; extern const char* const kEigenSingleThreadedMatMulS32SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulS32"; extern const char* const kEigenSingleThreadedConvF16SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 14ea5448eef..492ce3f68b2 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -46,6 +46,8 @@ namespace runtime { extern const char* const kEigenMatMulF16SymbolName; extern const char* const kEigenMatMulF32SymbolName; extern const char* const kEigenMatMulF64SymbolName; +extern const char* const kEigenMatMulC64SymbolName; +extern const char* const kEigenMatMulC128SymbolName; extern const char* const kEigenMatMulS32SymbolName; extern const char* const kMKLConvF32SymbolName; extern const char* const kMKLMatMulF32SymbolName; @@ -59,6 +61,8 @@ extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; +extern const char* const kEigenSingleThreadedMatMulC64SymbolName; +extern const char* const kEigenSingleThreadedMatMulC128SymbolName; extern const char* const kEigenSingleThreadedMatMulS32SymbolName; extern const char* const kEigenSingleThreadedConvF16SymbolName; extern const char* const kEigenSingleThreadedConvF32SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index fae9670051a..e21ed7ad60e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -154,7 +154,8 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %d bytes", size); + return InvalidArgument("CPU infeed of %d bytes exceeds maximum of %d bytes", + size, std::numeric_limits::max()); } if (size <= 0) { diff --git 
a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 7dba826b65c..9e75c1b9ac5 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -23,8 +23,17 @@ limitations under the License. #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" +#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" // from @llvm-project +#include "mlir/EDSC/Builders.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/xla/service/cpu/cpu_options.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/cpu/mlir_emitter.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h" #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h" @@ -89,6 +98,9 @@ enum class DotImplementationStrategy { // and the output have to be row major. kTiledLlvmIrGemm, + // The dot operation is lowered into linalg.matmul op and lowered to LLVM IR. + kLinalgMatmul, + // The dot operation is lowered into a call into an Eigen routine. No fusions // are supported today. The two inputs and the output have to be row major. // However, we do allow transposing either the LHS or the RHS as part of the @@ -112,7 +124,7 @@ class DotOpEmitter { const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* b, + llvm::IRBuilder<>* b, mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features); @@ -163,6 +175,9 @@ class DotOpEmitter { // Lowers the dot operation as a tiled Matrix*Matrix loop. void EmitTiledLlvmIrGemm(); + // Lowers the dot operation through MLIR's linalg.matmul. + Status EmitLinalgMatmul(); + // Lowers the dot operation as a naive nested loop that computes the result // one element at a time. 
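// Illustrative sketch, not part of the patch: the new kLinalgMatmul strategy is
// opt-in through the "xla_use_linalg_for_dot" backend extra option checked by
// cpu_options::UseLinalgForDot above. One way a caller could request it,
// assuming the usual DebugOptions plumbing; MakeLinalgDotDebugOptions is a
// hypothetical helper:
#include "tensorflow/compiler/xla/xla.pb.h"

xla::DebugOptions MakeLinalgDotDebugOptions() {
  xla::DebugOptions debug_options;
  // xla_backend_extra_options is a map<string, string>; UseLinalgForDot only
  // checks for the key's presence, so the value can stay empty.
  (*debug_options.mutable_xla_backend_extra_options())
      ["xla_use_linalg_for_dot"] = "";
  return debug_options;
}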
void EmitNaiveLlvmIrGemm(); @@ -194,20 +209,19 @@ class DotOpEmitter { const llvm_ir::IrArray* addend_array_; llvm::Value* executable_run_options_value_; llvm::IRBuilder<>* b_; + mlir::MLIRContext* mlir_context_; const HloModuleConfig& hlo_module_config_; const TargetMachineFeatures& target_machine_features_; }; } // namespace -DotOpEmitter::DotOpEmitter(DotInfo dot_info, string dot_hlo_name, - const llvm_ir::IrArray& target_array, - const llvm_ir::IrArray& lhs_array, - const llvm_ir::IrArray& rhs_array, - const llvm_ir::IrArray* addend_array, - llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* b, - const HloModuleConfig& hlo_module_config, - const TargetMachineFeatures& target_machine_features) +DotOpEmitter::DotOpEmitter( + DotInfo dot_info, string dot_hlo_name, const llvm_ir::IrArray& target_array, + const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, + const llvm_ir::IrArray* addend_array, + llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, + mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, + const TargetMachineFeatures& target_machine_features) : dot_info_(std::move(dot_info)), dot_hlo_name_(std::move(dot_hlo_name)), target_array_(target_array), @@ -216,9 +230,36 @@ DotOpEmitter::DotOpEmitter(DotInfo dot_info, string dot_hlo_name, addend_array_(addend_array), executable_run_options_value_(executable_run_options_value), b_(b), + mlir_context_(mlir_context), hlo_module_config_(hlo_module_config), target_machine_features_(target_machine_features) {} +Status DotOpEmitter::EmitLinalgMatmul() { + Shape operand_shapes[] = {dot_info_.lhs_shape, dot_info_.rhs_shape}; + llvm::Value* operand_ptrs[] = {lhs_array_.GetBasePointer(), + rhs_array_.GetBasePointer()}; + llvm::Value* target_ptr = target_array_.GetBasePointer(); + + // Zero out the output buffer. 
+ int64 size_bytes = ShapeUtil::ByteSizeOf(dot_info_.result_shape); + b_->CreateMemSet(target_ptr, b_->getInt8(0), /*Size=*/size_bytes, + /*Align=*/llvm::MaybeAlign(1)); + + std::string name = + absl::StrCat("linalgMatMul_", dot_info_.result_shape.ToString(true), "_", + dot_info_.lhs_shape.ToString(true), "_", + dot_info_.rhs_shape.ToString(true)); + return EmitMlirFuncAndCall( + mlir_context_, b_, dot_info_.result_shape, operand_shapes, target_ptr, + operand_ptrs, name, [&](mlir::OpBuilder* builder, mlir::FuncOp function) { + mlir::edsc::ScopedContext scope(*builder, function.getLoc()); + mlir::Value a = function.getArgument(0), b = function.getArgument(1), + c = function.getArgument(2); + mlir::edsc::intrinsics::linalg_matmul(b, c, a); + mlir::edsc::intrinsics::std_ret(); + }); +} + void DotOpEmitter::EmitTiledLlvmIrGemm() { PrimitiveType primitive_type = dot_info_.result_shape.element_type(); MatMultDims mat_mult_dims = GetMatMultDims(); @@ -418,6 +459,9 @@ Status DotOpEmitter::Emit() { EmitTiledLlvmIrGemm(); return Status::OK(); + case DotImplementationStrategy::kLinalgMatmul: + return EmitLinalgMatmul(); + case DotImplementationStrategy::kEigen: return EmitCallToRuntime(); } @@ -613,6 +657,8 @@ Status DotOpEmitter::EmitCallToRuntime() { bool multi_threaded = ShouldUseMultiThreadedEigen(hlo_module_config_); bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn(); PrimitiveType type = target_array_.GetShape().element_type(); + llvm::Function* function = b_->GetInsertBlock()->getParent(); + llvm::Module* module = function->getParent(); llvm::Type* float_type; const char* fn_name; switch (type) { @@ -640,6 +686,18 @@ Status DotOpEmitter::EmitCallToRuntime() { : runtime::kEigenSingleThreadedMatMulF64SymbolName); float_type = b_->getDoubleTy(); break; + case C64: + fn_name = multi_threaded + ? runtime::kEigenMatMulC64SymbolName + : runtime::kEigenSingleThreadedMatMulC64SymbolName; + float_type = llvm_ir::PrimitiveTypeToIrType(C64, module); + break; + case C128: + fn_name = multi_threaded + ? runtime::kEigenMatMulC128SymbolName + : runtime::kEigenSingleThreadedMatMulC128SymbolName; + float_type = llvm_ir::PrimitiveTypeToIrType(C128, module); + break; case S32: fn_name = multi_threaded ? runtime::kEigenMatMulS32SymbolName @@ -661,9 +719,6 @@ Status DotOpEmitter::EmitCallToRuntime() { int64_type, int64_type, int64_type, int32_type, int32_type}, /*isVarArg=*/false); - llvm::Function* function = b_->GetInsertBlock()->getParent(); - llvm::Module* module = function->getParent(); - llvm::FunctionCallee matmul_func = module->getOrInsertFunction(fn_name, matmul_type); if (auto* fn = llvm::dyn_cast(matmul_func.getCallee())) { @@ -809,9 +864,11 @@ bool AreGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, << output_shape.DebugString(); switch (output_shape.element_type()) { - case F64: - case F32: case F16: + case F32: + case F64: + case C64: + case C128: case S32: return IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape); default: @@ -860,7 +917,9 @@ bool CanEmitTiledLlvmIrGemm( return false; } - if (dot_info.result_shape.element_type() == F16) { + if (dot_info.result_shape.element_type() == F16 || + dot_info.result_shape.element_type() == C64 || + dot_info.result_shape.element_type() == C128) { // TODO(sanjoy): This is probably easy to fix, but I want to keep the CL // adding this comment NFC. 
return false; @@ -886,9 +945,12 @@ DotImplementationStrategy GetDotImplementationStrategy( } if (IsAlignedGemm(dot_info, target_machine_features)) { - return CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features) - ? DotImplementationStrategy::kTiledLlvmIrGemm - : DotImplementationStrategy::kEigen; + if (CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features)) { + return options::UseLinalgForDot(config) + ? DotImplementationStrategy::kLinalgMatmul + : DotImplementationStrategy::kTiledLlvmIrGemm; + } + return DotImplementationStrategy::kEigen; } return DotImplementationStrategy::kNaiveLlvmIr; @@ -899,15 +961,15 @@ Status EmitNonBatchDotOperation( const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, - const HloModuleConfig& hlo_module_config, + mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features) { PrimitiveType type = target_array.GetShape().element_type(); TF_RET_CHECK(S32 == type || F16 == type || F32 == type || F64 == type || C64 == type || C128 == type); DotOpEmitter dot_emitter(std::move(dot_info), std::move(hlo_name), target_array, lhs_array, rhs_array, addend_array, - executable_run_options_value, b, hlo_module_config, - target_machine_features); + executable_run_options_value, b, mlir_context, + hlo_module_config, target_machine_features); return dot_emitter.Emit(); } @@ -981,7 +1043,7 @@ Status EmitBatchDotOperation( const HloInstruction& dot, const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, - const HloModuleConfig& hlo_module_config, + mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features) { TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(dot.dot_dimension_numbers())); @@ -1039,7 +1101,7 @@ Status EmitBatchDotOperation( // Emit the inner non-batch dot operation. 
return EmitNonBatchDotOperation( dot_info, dot.name(), target_slice, lhs_slice, rhs_slice, nullptr, - executable_run_options_value, b, hlo_module_config, + executable_run_options_value, b, mlir_context, hlo_module_config, target_machine_features); }); } @@ -1089,7 +1151,7 @@ Status EmitDotOperation(const HloInstruction& dot, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* b, + llvm::IRBuilder<>* b, mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features) { // This routine assumes that the dot operation is not in a parallelized @@ -1099,13 +1161,13 @@ Status EmitDotOperation(const HloInstruction& dot, if (IsBatchDot(dot)) { TF_RET_CHECK(addend_array == nullptr); return EmitBatchDotOperation(dot, target_array, lhs_array, rhs_array, - executable_run_options_value, b, + executable_run_options_value, b, mlir_context, hlo_module_config, target_machine_features); } return EmitNonBatchDotOperation(DotInfo(dot), dot.name(), target_array, lhs_array, rhs_array, addend_array, - executable_run_options_value, b, + executable_run_options_value, b, mlir_context, hlo_module_config, target_machine_features); } } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h index 105bd3005c8..d9cf8a2036b 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h @@ -18,6 +18,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/IR/IRBuilder.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -63,7 +64,7 @@ Status EmitDotOperation(const HloInstruction& dot, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* b, + llvm::IRBuilder<>* b, mlir::MLIRContext* mlir_context, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features); } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index e21ca01c803..05364a4492b 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -109,24 +109,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { - case HloOpcode::kMap: - return [this, hlo, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - std::vector operands; - for (int i = 0; i < hlo->operand_count(); i++) { - TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, - operand_to_generator.at(hlo->operand(i))(index)); - operands.push_back(operand_value); - } - return ir_emitter_->EmitElementalMap(*Cast(hlo), - operands, llvm_ir::IrName(hlo)); - }; - case HloOpcode::kReduceWindow: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - return ir_emitter_->EmitElementalReduceWindow( - Cast(hlo), - operand_to_generator.at(hlo->operand(0)), index); - }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const 
IrArray::Index& index) { return ir_emitter_->EmitElementalConvolution( @@ -134,22 +116,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( operand_to_generator.at(hlo->operand(0)), operand_to_generator.at(hlo->operand(1)), index); }; - case HloOpcode::kReduce: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - auto reduce_instr = Cast(hlo); - std::vector input_generators; - for (const HloInstruction* instr : reduce_instr->inputs()) { - input_generators.push_back(operand_to_generator.at(instr)); - } - - std::vector initial_value_generators; - for (const HloInstruction* instr : reduce_instr->init_values()) { - initial_value_generators.push_back(operand_to_generator.at(instr)); - } - return ir_emitter_->EmitElementalReduce( - reduce_instr, std::move(input_generators), - std::move(initial_value_generators), index); - }; default: return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index e3fba9306b7..5c9f6677ab3 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -44,6 +44,12 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitTanh(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) override { + return ir_emitter_->EmitThreadLocalCall(callee, parameters, name); + } + IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index c19fa779b60..998b9db132c 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include + #include #include #include @@ -40,6 +41,7 @@ limitations under the License. 
#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -88,8 +90,8 @@ using llvm_ir::SetToFirstInsertPoint; namespace cpu { IrEmitter::IrEmitter( - const HloModule& hlo_module, const BufferAssignment& assignment, - llvm::Module* llvm_module, + mlir::MLIRContext* mlir_context, const HloModule& hlo_module, + const BufferAssignment& assignment, llvm::Module* llvm_module, std::unordered_map instruction_to_profile_idx, std::unordered_map computation_to_profile_idx, const TargetMachineFeatures* target_machine_features, @@ -98,6 +100,7 @@ IrEmitter::IrEmitter( module_(llvm_module), arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()), b_(llvm_module->getContext()), + mlir_context_(mlir_context), instruction_to_profile_idx_(std::move(instruction_to_profile_idx)), computation_to_profile_idx_(std::move(computation_to_profile_idx)), alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), @@ -570,25 +573,9 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort)); Shape keys_shape = sort->keys()->shape(); PrimitiveType keys_type = keys_shape.element_type(); - switch (keys_type) { - case PRED: - case S8: - case U8: - case S16: - case U16: - case BF16: - case F16: - case S32: - case U32: - case F32: - case S64: - case U64: - case F64: - break; - default: - return Unimplemented( - "Element type %s not supported in the Sort op on CPU.", - PrimitiveType_Name(keys_type)); + if (!primitive_util::IsArrayType(keys_type)) { + return Unimplemented("Element type %s not supported in the Sort op on CPU.", + PrimitiveType_Name(keys_type)); } std::vector destination_addresses(sort->operand_count()); for (int64 i = 0; i < sort->operand_count(); ++i) { @@ -695,101 +682,6 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -llvm::Value* IrEmitter::EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, absl::string_view name) { - return EmitScalarReturningThreadLocalCall(*map_instr.to_apply(), - elemental_operands, name); -} - -StatusOr IrEmitter::EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index) { - const HloInstruction* operand = reduce_window->operand(0); - const Window& window = reduce_window->window(); - - // We fold inputs into the accumulator and initialize it to - // the initial value on the reduce_window. 
- PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accumulator_address", &b_, - MinimumAlignmentForPrimitiveType(operand_element_type)); - Store(Load(GetEmittedValueFor(reduce_window->operand(1))), - accumulator_address); - - llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const llvm_ir::IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds_condition = nullptr; - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* strided_index = - NSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(strided_index, - NSWMul(window_index[i], - b_.getInt64(window.dimensions(i).window_dilation()))), - b_.getInt64(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = - ICmpEQ(SRem(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())), - b_.getInt64(0)); - if (in_bounds_condition == nullptr) { - in_bounds_condition = dilation_condition; - } else { - in_bounds_condition = And(in_bounds_condition, dilation_condition); - } - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())); - - // We need to check if 0 <= input_multi_index[i] < bound, as otherwise we - // are in the padding so that we can skip the computation. That is - // equivalent to input_multi_index[i] < bound as an *unsigned* comparison, - // since a negative value will wrap to a large positive value. - llvm::Value* index_condition = - ICmpULT(input_multi_index[i], - b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - if (in_bounds_condition == nullptr) { - in_bounds_condition = index_condition; - } else { - in_bounds_condition = And(in_bounds_condition, index_condition); - } - } - CHECK(in_bounds_condition != nullptr); - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); - SetToFirstInsertPoint(if_data.true_block, &b_); - - // We are not in the padding, so carry out the computation. - llvm_ir::IrArray::Index input_index(input_multi_index, operand->shape(), - b_.getInt64Ty()); - TF_ASSIGN_OR_RETURN(llvm::Value* const input_value, - input_generator(input_index)); - llvm::Value* result = EmitScalarReturningThreadLocalCall( - *reduce_window->to_apply(), {Load(accumulator_address), input_value}, - "reducer_function"); - Store(result, accumulator_address); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return Load(accumulator_address); -} - Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // Pseudo code for reduce window: // @@ -1008,7 +900,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { // Dot operation is complicated so we delegate to a helper class. 
return EmitDotOperation(*dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr, - GetExecutableRunOptionsArgument(), &b_, + GetExecutableRunOptionsArgument(), &b_, mlir_context_, hlo_module_config_, target_machine_features_); } @@ -1325,7 +1217,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { auto operand = fft->operand(0); TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*fft, /*operands=*/{operand}, - /*supported_types=*/{F32, C64})); + /*supported_types=*/{F32, F64, C64, C128})); TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout())); TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout())); VLOG(3) << "operand=" << ShapeUtil::HumanStringWithLayout(operand->shape()); @@ -1347,7 +1239,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { llvm::FunctionType* fft_type = llvm::FunctionType::get( b_.getVoidTy(), {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, - int64_type, int64_type, int64_type, int64_type}, + int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); bool multi_threaded_eigen = @@ -1366,6 +1258,8 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {GetExecutableRunOptionsArgument(), BitCast(GetEmittedValueFor(fft), int8_ptr_type), BitCast(operand_address, int8_ptr_type), b_.getInt32(fft->fft_type()), + b_.getInt32(operand->shape().element_type() == F64 || + operand->shape().element_type() == C128), b_.getInt32(fft_rank), b_.getInt64(input_batch), b_.getInt64(fft_rank > 0 ? fft_length[0] : 0), b_.getInt64(fft_rank > 1 ? fft_length[1] : 0), @@ -2099,108 +1993,6 @@ StatusOr IrEmitter::EmitVectorizedReduce( return true; } -StatusOr IrEmitter::EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generators, - const llvm_ir::IrArray::Index& index) { - const Shape& out_shape = reduce->shape(); - bool is_variadic = !out_shape.IsArray(); - int accumulators_count = 1; - if (is_variadic) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - absl::Span reduced_dimensions(reduce->dimensions()); - - std::vector accumulator_addrs; - std::vector accumulator_types; - for (int i = 0; i < accumulators_count; i++) { - const Shape& element_shape = - is_variadic ? out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - accumulator_types.push_back(accumulator_llvm_type); - - // Initialize an accumulator with init_value. - llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( - accumulator_llvm_type, "accumulator_" + std::to_string(i), &b_, - MinimumAlignmentForPrimitiveType(accumulator_type)); - TF_ASSIGN_OR_RETURN( - llvm::Value* const init_value, - initial_value_generators[i](llvm_ir::IrArray::Index(index.GetType()))); - Store(init_value, accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - } - - // The enclosing loops go over all the target elements. Now we have to compute - // the actual target element. For this, we build a new loop nest to iterate - // over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction Value*s - // are placed for each dimension in dimensions, and all the rest are nullptrs. 
- llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - const HloInstruction* arg = reduce->operand(0); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using input_multi_index as the - // base. In input_multi_index only the reduction dimensions are filled in. We - // fill in the rest of the dimensions with induction Value*s taken from - // 'index' which iterates over the target array. See the high-level - // description in the XLA documentation for details. - llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - - std::vector reduction_operands; - for (llvm::Value* accum : accumulator_addrs) { - llvm::Value* accum_value = Load(accum); - reduction_operands.push_back(accum_value); - } - - for (int i = 0; i < accumulators_count; i++) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, - input_generators[i](input_index)); - reduction_operands.push_back(input_element); - } - - std::vector results = EmitThreadLocalCall( - *reduce->to_apply(), reduction_operands, "reduce_function"); - - CHECK(results.size() == accumulators_count); - for (int i = 0; i < accumulators_count; i++) { - Store(results[i], accumulator_addrs[i]); - } - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (is_variadic) { - // Emit a structure, as that what the LoopEmitter expects. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } else { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } -} - Status IrEmitter::HandleReduce(HloInstruction* reduce) { auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); @@ -2517,10 +2309,10 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { llvm_ir::IrArray addend_array( GetIrArrayFor(fusion->operand(addend_param_number))); - TF_RETURN_IF_ERROR( - EmitDotOperation(*dot, target_array, lhs_array, rhs_array, - &addend_array, GetExecutableRunOptionsArgument(), &b_, - hlo_module_config_, target_machine_features_)); + TF_RETURN_IF_ERROR(EmitDotOperation( + *dot, target_array, lhs_array, rhs_array, &addend_array, + GetExecutableRunOptionsArgument(), &b_, mlir_context_, + hlo_module_config_, target_machine_features_)); return Status::OK(); } else { return Unimplemented("Fusion kind not implemented on CPU"); @@ -2554,7 +2346,125 @@ Status IrEmitter::HandleCall(HloInstruction* call) { return Status::OK(); } +Status IrEmitter::HandleSliceToDynamic(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + std::vector dynamic_dims; + int32 raw_data_size = + ShapeUtil::ByteSizeOf(ShapeUtil::MakeStaticShape(hlo->shape())); + llvm::Value* dest_buffer = GetEmittedValueFor(hlo); + llvm::Value* raw_buffer = + b_.CreateBitCast(dest_buffer, b_.getInt8Ty()->getPointerTo()); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + const int64 dim_index = i - 1; + llvm::Value* source_buffer = 
GetEmittedValueFor(hlo->operand(i)); + llvm::LoadInst* dyn_dim_size = b_.CreateLoad(source_buffer, "dyn_dim_size"); + + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + b_.CreateStore(dyn_dim_size, + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + dynamic_dims.push_back(b_.CreateIntCast(dyn_dim_size, b_.getInt64Ty(), + /*isSigned=*/true, + "i64_dyn_dim_size")); + } + + llvm_ir::IrArray data_array = GetIrArrayFor(hlo); + // Pseudo code for sliceToDynamic: + // + // for (index i in dynamic_dim) + // dest_index = delinearize(linearize(i, dynamic_dim), static_dim) + // dest[dest_index] = source[i] + auto loop_body_emitter = + [&](const llvm_ir::IrArray::Index& array_index) -> Status { + llvm::Value* source_element = + GetIrArrayFor(hlo->operand(0)).EmitReadArrayElement(array_index, &b_); + llvm::Value* linear_index = array_index.Linearize(dynamic_dims, &b_); + // Delinearize the index based on the static shape. + llvm_ir::IrArray::Index dest_index(linear_index, data_array.GetShape(), + &b_); + data_array.EmitWriteArrayElement(dest_index, source_element, &b_); + return Status::OK(); + }; + return llvm_ir::LoopEmitter(loop_body_emitter, data_array.GetShape(), + dynamic_dims, &b_) + .EmitLoop(IrName(hlo)); +} + +Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice data_slice, + assignment_.GetUniqueSlice(hlo, {0})); + std::vector dynamic_dims; + std::vector tuple_operand_ptrs; + const Shape& data_shape = ShapeUtil::GetSubshape(hlo->shape(), {0}); + const Shape& input_shape = hlo->operand(0)->shape(); + llvm::Value* data_address = EmitBufferPointer(data_slice, data_shape); + llvm_ir::IrArray data_array(data_address, data_shape); + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(0)); + llvm::Value* raw_buffer = + b_.CreateBitCast(source_buffer, b_.getInt8Ty()->getPointerTo()); + int64 raw_data_size = + ShapeUtil::ByteSizeOf(ShapeUtil::MakeStaticShape(input_shape)); + + // Put a placeholder for the data array's pointer + tuple_operand_ptrs.push_back(data_array.GetBasePointer()); + // PadToStatic has a dynamic tensor as input and variadic size of outputs: + // (static_tensor, dynamic_dim_0, dynamic_dim_1, ... ) + // Dynamic dimension sizes starts from output index 1. + for (int64 i = 1; i < hlo->shape().tuple_shapes_size(); ++i) { + // Read from the metadata section of the dynamic input (operand 0). 
+ const Shape& dim_shape = ShapeUtil::GetSubshape(hlo->shape(), {i}); + TF_RET_CHECK(Shape::Equal()(dim_shape, ShapeUtil::MakeScalarShape(S32))); + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dim_size_slice, + assignment_.GetUniqueSlice(hlo, {i})); + llvm::Value* dest_dim_size_address = + EmitBufferPointer(dim_size_slice, data_shape); + const int64 dim_index = i - 1; + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + llvm::Value* dyn_dim_size = b_.CreateLoad( + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo()), + "dyn_dim_size"); + b_.CreateStore(dyn_dim_size, + b_.CreateBitCast(dest_dim_size_address, + b_.getInt32Ty()->getPointerTo())); + dynamic_dims.push_back(b_.CreateIntCast(dyn_dim_size, b_.getInt64Ty(), + /*isSigned=*/true, + "i64_dyn_dim_size")); + tuple_operand_ptrs.push_back(dest_dim_size_address); + } + + // Pseudo code for padToStatic: + // + // for (index i in dynamic_dim) + // source_index = delinearize(inearize(i, dynamic_dim), static_dim) + // dest[i] = source[source_index] + auto loop_body_emitter = + [&](const llvm_ir::IrArray::Index& array_index) -> Status { + llvm::Value* linear_index = array_index.Linearize(dynamic_dims, &b_); + llvm_ir::IrArray::Index source_index(linear_index, input_shape, &b_); + llvm::Value* source_element = + GetIrArrayFor(hlo->operand(0)).EmitReadArrayElement(source_index, &b_); + data_array.EmitWriteArrayElement(array_index, source_element, &b_); + return Status::OK(); + }; + TF_RETURN_IF_ERROR( + llvm_ir::LoopEmitter(loop_body_emitter, input_shape, dynamic_dims, &b_) + .EmitLoop(IrName(hlo))); + + // Emit static tensor and dynamic sizes as one tuple. + llvm_ir::EmitTuple(GetIrArrayFor(hlo), tuple_operand_ptrs, &b_); + return Status::OK(); +} + Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { + if (custom_call->custom_call_target() == "PadToStatic") { + return HandlePadToStatic(custom_call); + } + if (custom_call->custom_call_target() == "SliceToDynamic") { + return HandleSliceToDynamic(custom_call); + } absl::Span operands(custom_call->operands()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = @@ -2999,9 +2909,8 @@ Status IrEmitter::HandleRngGetAndUpdateState(HloInstruction* rng_state) { old_state->getType()->getScalarType(), address->getType()->getPointerAddressSpace())); llvm::StoreInst* store = Store(old_state, address); - store->setAlignment( - llvm::MaybeAlign(IrEmitter::MinimumAlignmentForPrimitiveType( - rng_state->shape().element_type()))); + store->setAlignment(llvm::Align(IrEmitter::MinimumAlignmentForPrimitiveType( + rng_state->shape().element_type()))); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index cc5aa3f37fc..661785153d0 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_ #include + #include #include #include @@ -32,6 +33,7 @@ limitations under the License. 
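// Illustrative sketch, not part of the patch: the HandleSliceToDynamic and
// HandlePadToStatic handlers added above copy between a dynamically sized view
// and its statically bounded buffer by linearizing an index in one set of
// bounds and delinearizing it in the other, as their pseudo code comments
// describe. The index arithmetic in plain scalar form, assuming the default
// row-major layout; the helper names below are made up for this sketch:
#include <cstdint>
#include <vector>

// Row-major linearization of a multi-dimensional index over `bounds`.
int64_t LinearizeIndex(const std::vector<int64_t>& index,
                       const std::vector<int64_t>& bounds) {
  int64_t linear = 0;
  for (int d = 0; d < static_cast<int>(bounds.size()); ++d) {
    linear = linear * bounds[d] + index[d];
  }
  return linear;
}

// Inverse of the above, possibly against a different (e.g. static) set of bounds.
std::vector<int64_t> DelinearizeIndex(int64_t linear,
                                      const std::vector<int64_t>& bounds) {
  std::vector<int64_t> index(bounds.size());
  for (int d = static_cast<int>(bounds.size()) - 1; d >= 0; --d) {
    index[d] = linear % bounds[d];
    linear /= bounds[d];
  }
  return index;
}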
#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Target/TargetMachine.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/cpu/ir_function.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" @@ -58,6 +60,8 @@ namespace cpu { // functions. class IrEmitter : public DfsHloVisitorWithDefault, public IrBuilderMixin { + friend class CpuElementalIrEmitter; + public: using GeneratorForOperandIrArrays = std::function()>; @@ -67,14 +71,16 @@ class IrEmitter : public DfsHloVisitorWithDefault, // hlo_module: the HLO module we are emitting IR for. // assignment: a BufferAssignment from which we know which buffers are used by // the HLO nodes. - // llvm_module: the LLVM module to emit IR into. + // mlir_context: the MLIR context used for IR emission. + // llvm_module: the LLVM module to emit IR into. It's built using the LLVM + // context inside of mlir_context. // instruction_to_profile_idx: the mapping from HLO instructions to their // index in the profiling array. // computation_to_profile_idx: the mapping from HLO computations to their // index in the profiling array. // emit_code_for_msan: whether emitted code should be compatible with msan. - IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment, - llvm::Module* llvm_module, + IrEmitter(mlir::MLIRContext* mlir_context, const HloModule& hlo_module, + const BufferAssignment& assignment, llvm::Module* llvm_module, std::unordered_map instruction_to_profile_idx, std::unordered_map @@ -113,28 +119,12 @@ class IrEmitter : public DfsHloVisitorWithDefault, // Emit an LLVM global variable for every constant buffer allocation. Status EmitConstantGlobals(); - // Emit code to map one element according to `map_instr`. - llvm::Value* EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, - absl::string_view name); - // Emit code to emit the element at `index` for a reduce window instruction. - StatusOr EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index); // Emit code to emit the element at `index` for a convolution instruction. StatusOr EmitElementalConvolution( const HloConvolutionInstruction* convolution, const llvm_ir::ElementGenerator& input_generator, const llvm_ir::ElementGenerator& kernel_generator, const llvm_ir::IrArray::Index& index); - // Emit code to emit the element at `index` for a reduce instruction. - StatusOr EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generator, - const llvm_ir::IrArray::Index& index); protected: // @@ -197,6 +187,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, } private: + Status HandleSliceToDynamic(HloInstruction* hlo); + Status HandlePadToStatic(HloInstruction* hlo); Status HandleAllReduceSingleReplica(HloInstruction* crs); Status HandleAllReduceMultipleReplica(HloInstruction* crs); @@ -454,6 +446,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, // module's function list). std::unique_ptr compute_function_; llvm::IRBuilder<> b_; + mlir::MLIRContext* mlir_context_; // The buffer allocation slice for the root of the computation being compiled. // Only relevant for thread local computations. 
diff --git a/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc new file mode 100644 index 00000000000..e7d52c288d5 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc @@ -0,0 +1,132 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/mlir_emitter.h" + +#include "llvm/Linker/Linker.h" +#include "llvm/Transforms/IPO/Internalize.h" +#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" // from @llvm-project +#include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/LLVMIR.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" + +namespace xla { +namespace cpu { +namespace { + +// Lower an MLIR module to an LLVM module. +std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module) { + mlir::PassManager manager(module->getContext()); + manager.addPass(mlir::createConvertLinalgToLoopsPass()); + manager.addPass(mlir::createConvertLinalgToLLVMPass()); + manager.addPass(mlir::createConvertVectorToLLVMPass()); + manager.addPass(mlir::createLowerToLLVMPass()); + CHECK(succeeded(manager.run(*module))); + return mlir::translateModuleToLLVMIR(*module); +} + +// Get arguments to pass a memref to an mlir function. +void BuildViewForBuffer(llvm::SmallVectorImpl *args, + llvm::IRBuilder<> *b, const Shape &opShape, + llvm::Value *op_val) { + llvm::Type *ty = op_val->getType(); + while (auto aty = llvm::dyn_cast( + llvm::cast(ty)->getElementType())) { + ty = aty->getElementType()->getPointerTo(); + } + op_val = b->CreateBitCast(op_val, ty); + + args->push_back(op_val); // Allocated pointer. + args->push_back(op_val); // Aligned pointer. + args->push_back(b->getInt64(0)); // Offset. + + // Sizes. + for (int64 dim : opShape.dimensions()) { + args->push_back(b->getInt64(dim)); + } + + int64_t accumulated_stride = 1; + llvm::SmallVector strides(opShape.rank(), 1); + for (int64 dim : LayoutUtil::MinorToMajor(opShape)) { + strides[dim] = accumulated_stride; + accumulated_stride *= opShape.dimensions(dim); + } + + // Strides. 
+ for (int64 stride : strides) { + args->push_back(b->getInt64(stride)); + } +} +} // namespace + +Status EmitMlirFuncAndCall( + mlir::MLIRContext *context, llvm::IRBuilder<> *b, const Shape &result_shape, + llvm::ArrayRef operand_shapes, llvm::Value *result_ptr, + llvm::ArrayRef operand_ptrs, llvm::StringRef func_name, + llvm::function_ref emitter) { + llvm::Module *llvm_module = b->GetInsertBlock()->getParent()->getParent(); + mlir::Builder mlir_builder(context); + + // Get memref types for the inputs and output. + TF_ASSIGN_OR_RETURN(mlir::Type ret_memref, ConvertTensorShapeToMemRefType( + result_shape, mlir_builder)); + std::vector operand_types = {ret_memref}; + for (int i = 0; i != operand_shapes.size(); ++i) { + TF_ASSIGN_OR_RETURN( + mlir::Type op_memref, + ConvertTensorShapeToMemRefType(operand_shapes[i], mlir_builder)); + operand_types.push_back(op_memref); + } + + // Create the function an call the emission callback. + mlir::Location loc = mlir::UnknownLoc::get(context); + auto function = mlir::FuncOp::create( + loc, func_name, mlir::FunctionType::get(operand_types, {}, context)); + function.addEntryBlock(); + mlir::OwningModuleRef mlir_module = mlir::ModuleOp::create(loc); + mlir_module->push_back(function); + mlir::OpBuilder op_builder(&function.getBody()); + emitter(&op_builder, function); + + // Now link it all into the main LLVM module. + auto mlir_llvm_module = MakeLLVMModule(std::move(mlir_module)); + mlir_llvm_module->setDataLayout(llvm_module->getDataLayout()); + llvm::Linker::linkModules( + *llvm_module, std::move(mlir_llvm_module), llvm::Linker::None, + [](llvm::Module &M, const llvm::StringSet<> &GVS) { + llvm::internalizeModule(M, [&GVS](const llvm::GlobalValue &GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + }); + + // And leave behind a call to the function generated by MLIR. + llvm::Function *func = llvm_module->getFunction(func_name); + llvm::SmallVector op_vals; + BuildViewForBuffer(&op_vals, b, result_shape, result_ptr); + for (int i = 0; i != operand_shapes.size(); ++i) { + BuildViewForBuffer(&op_vals, b, operand_shapes[i], operand_ptrs[i]); + } + b->CreateCall(func, op_vals); + + return Status::OK(); +} + +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/mlir_emitter.h b/tensorflow/compiler/xla/service/cpu/mlir_emitter.h new file mode 100644 index 00000000000..bc0741e851a --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/mlir_emitter.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_MLIR_EMITTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_MLIR_EMITTER_H_ + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Value.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/status.h" + +namespace xla { +namespace cpu { + +// Create a new MLIR function with the name `func_name`, populate it with +// `emitter` and create a call, passing it the buffers defined by +// resultShape/resultPtr and operandShapes/operandPtrs. The function is added to +// the LLVM module at `b`s insertion point. +Status EmitMlirFuncAndCall( + mlir::MLIRContext *context, llvm::IRBuilder<> *b, const Shape &result_shape, + llvm::ArrayRef operand_shapes, llvm::Value *result_ptr, + llvm::ArrayRef operand_ptrs, llvm::StringRef func_name, + llvm::function_ref emitter); + +} // namespace cpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_MLIR_EMITTER_H_ diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 14afe770ede..225102e6ae6 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -142,24 +142,29 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // in-place will only touch the updated elements). // TODO(b/27458679) Parallelize instructions which are skipped here. auto opcode = instruction->opcode(); - if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant || - opcode == HloOpcode::kCall || opcode == HloOpcode::kCustomCall || - opcode == HloOpcode::kDot || opcode == HloOpcode::kSelectAndScatter || - opcode == HloOpcode::kGetTupleElement || opcode == HloOpcode::kBitcast || - opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed || - opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng || - opcode == HloOpcode::kSort || - (opcode == HloOpcode::kConvolution && - PotentiallyImplementedAsEigenConvolution(*instruction, - target_machine_features_)) || - (opcode == HloOpcode::kFusion && !instruction->IsLoopFusion()) || - llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || - instruction->shape().IsTuple()) { + if (llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || + instruction->shape().IsTuple() || opcode == HloOpcode::kRng) { return 1; } - // Consult 'cost_model_' to compute target parallel task count. - return cost_model_->GetParallelTaskCount(instruction); + // Only allow known good instructions. + if (instruction->IsElementwise() || instruction->IsLoopFusion() || + opcode == HloOpcode::kBroadcast || opcode == HloOpcode::kConcatenate || + opcode == HloOpcode::kDynamicSlice || + opcode == HloOpcode::kDynamicUpdateSlice || + opcode == HloOpcode::kGather || opcode == HloOpcode::kIota || + opcode == HloOpcode::kPad || opcode == HloOpcode::kReduce || + opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kReshape || + opcode == HloOpcode::kReverse || opcode == HloOpcode::kSlice || + opcode == HloOpcode::kTranspose || + (opcode == HloOpcode::kConvolution && + !PotentiallyImplementedAsEigenConvolution(*instruction, + target_machine_features_))) { + // Consult 'cost_model_' to compute target parallel task count. 
+ return cost_model_->GetParallelTaskCount(instruction); + } + + return 1; } StatusOr ParallelTaskAssigner::Run(HloModule* module) { diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index e2c93568b74..e22210a61f2 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -170,5 +170,26 @@ TEST_F(ParallelTaskAssignmentTest, InPlaceDynamicUpdateSliceNotParallelized) { EXPECT_FALSE(changed); } +TEST_F(ParallelTaskAssignmentTest, AllReduceNotParallelized) { + constexpr char hlo_string[] = R"( + HloModule TestTaskParallel_allreduce + add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + + ENTRY CRS { + input = f32[1234567] parameter(0) + ROOT crs = f32[1234567] all-reduce(input), replica_groups={}, to_apply=add + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc index 051120be324..0c1e9dae751 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc @@ -28,13 +28,14 @@ using tensorflow::int64; TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenFft( const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { + int32 double_precision, int32 fft_rank, int64 input_batch, + int64 fft_length0, int64 fft_length1, int64 fft_length2) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); tensorflow::xla::EigenFftImpl( *run_options->intra_op_thread_pool(), out, operand, - static_cast(fft_type), fft_rank, input_batch, - fft_length0, fft_length1, fft_length2); + static_cast(fft_type), + static_cast(double_precision), fft_rank, input_batch, fft_length0, + fft_length1, fft_length2); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_fft.h index f20c5aa0aa2..d95da172116 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.h @@ -22,7 +22,8 @@ extern "C" { extern void __xla_cpu_runtime_EigenFft( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + void* operand, tensorflow::int32 fft_type, + tensorflow::int32 double_precision, tensorflow::int32 fft_rank, tensorflow::int64 input_batch, tensorflow::int64 fft_length0, tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 04dea120a8d..124e7d589a0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -39,8 +39,8 @@ static constexpr int kFftTypeArraySize = 4; namespace internal { // Computes either a forward or reverse complex-to-complex FFT. 
-template -void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, +template +void EigenFftC2C(const EigenDevice& device, Complex* out, Complex* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { // Create the axes (which are always trailing). @@ -55,10 +55,10 @@ void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, for (int i = 0; i < FFTRank; i++) { dims[i + 1] = fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, dims); output.device(device) = input.template fft(axes); @@ -66,8 +66,8 @@ void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, // Computes a forward real->complex FFT, slicing out redundant negative // frequencies from the innermost dimension. -template -void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, +template +void EigenFftR2C(const EigenDevice& device, Complex* out, Real* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { const std::array fft_shape = { @@ -81,10 +81,10 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, in_dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, out_dims); @@ -92,7 +92,7 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Eigen::Tensor full_fft(in_dims); + Eigen::Tensor full_fft(in_dims); const Eigen::DSizes zero_start_indices; full_fft.device(device) = @@ -105,8 +105,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, // Computes a reverse complex->real FFT, reconstructing redundant negative // frequencies using reverse conjugate on innermost dimension after doing IFFT // on outer dimensions. -template -void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, +template +void EigenFftC2R(const EigenDevice& device, Real* out, Complex* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { const std::array fft_shape = { @@ -120,10 +120,10 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, in_dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, out_dims); @@ -131,7 +131,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Eigen::Tensor full_fft(out_dims); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. 
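The EigenFftWithRank change in the next hunk turns the runtime double_precision flag into a compile-time element type, dispatching to either a single- or a double-precision instantiation of the templated helpers above. Below is a minimal, self-contained sketch of that dispatch pattern, with a toy O(n^2) DFT standing in for Eigen's tensor FFT; it is illustrative only and not the XLA runtime code:

#include <algorithm>
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Naive DFT, templated on the complex type just like the Eigen helpers above.
template <typename Complex>
std::vector<Complex> NaiveDft(const std::vector<Complex>& in) {
  using Real = typename Complex::value_type;
  const std::size_t n = in.size();
  const Real two_pi = static_cast<Real>(6.283185307179586);
  std::vector<Complex> out(n);
  for (std::size_t k = 0; k < n; ++k) {
    Complex acc(0, 0);
    for (std::size_t t = 0; t < n; ++t) {
      const Real angle =
          -two_pi * static_cast<Real>(k * t) / static_cast<Real>(n);
      acc += in[t] * Complex(std::cos(angle), std::sin(angle));
    }
    out[k] = acc;
  }
  return out;
}

// Runtime precision flag -> compile-time element type, mirroring how
// EigenFftWithRank selects between the complex64 and complex128 paths.
void RunC2CFft(const void* operand, void* out, std::size_t n,
               bool double_precision) {
  if (double_precision) {
    const auto* in = static_cast<const std::complex<double>*>(operand);
    std::vector<std::complex<double>> result(
        NaiveDft(std::vector<std::complex<double>>(in, in + n)));
    std::copy(result.begin(), result.end(),
              static_cast<std::complex<double>*>(out));
  } else {
    const auto* in = static_cast<const std::complex<float>*>(operand);
    std::vector<std::complex<float>> result(
        NaiveDft(std::vector<std::complex<float>>(in, in + n)));
    std::copy(result.begin(), result.end(),
              static_cast<std::complex<float>*>(out));
  }
}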
@@ -178,30 +178,59 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, - FftType fft_type, int64 input_batch, int64 fft_length0, - int64 fft_length1, int64 fft_length2) { + FftType fft_type, bool double_precision, + int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { switch (fft_type) { case FftType::FFT: - EigenFftC2C( - device, static_cast(out), - static_cast(operand), input_batch, fft_length0, - fft_length1, fft_length2); + if (double_precision) { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } break; case FftType::IFFT: - EigenFftC2C( - device, static_cast(out), - static_cast(operand), input_batch, fft_length0, - fft_length1, fft_length2); + if (double_precision) { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } break; case FftType::RFFT: - EigenFftR2C( - device, static_cast(out), static_cast(operand), - input_batch, fft_length0, fft_length1, fft_length2); + if (double_precision) { + EigenFftR2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftR2C( + device, static_cast(out), static_cast(operand), + input_batch, fft_length0, fft_length1, fft_length2); + } break; case FftType::IRFFT: - EigenFftC2R( - device, static_cast(out), static_cast(operand), - input_batch, fft_length0, fft_length1, fft_length2); + if (double_precision) { + EigenFftC2R( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2R( + device, static_cast(out), static_cast(operand), + input_batch, fft_length0, fft_length1, fft_length2); + } break; default: // Unsupported FFT type @@ -213,22 +242,24 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, template void EigenFftImpl(const EigenDevice& device, void* out, void* operand, - FftType fft_type, int32 fft_rank, int64 input_batch, - int64 fft_length0, int64 fft_length1, int64 fft_length2) { + FftType fft_type, bool double_precision, int32 fft_rank, + int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { switch (fft_rank) { case 1: - internal::EigenFftWithRank<1, EigenDevice>( - device, out, operand, fft_type, input_batch, fft_length0, 0, 0); + internal::EigenFftWithRank<1, EigenDevice>(device, out, operand, fft_type, + double_precision, input_batch, + fft_length0, 0, 0); break; case 2: internal::EigenFftWithRank<2, EigenDevice>(device, out, operand, fft_type, - input_batch, fft_length0, - fft_length1, 0); + double_precision, input_batch, + fft_length0, fft_length1, 0); break; case 3: - internal::EigenFftWithRank<3, EigenDevice>(device, out, operand, fft_type, - input_batch, fft_length0, - fft_length1, fft_length2); + internal::EigenFftWithRank<3, EigenDevice>( + device, out, operand, fft_type, double_precision, input_batch, + fft_length0, fft_length1, fft_length2); break; default: // Unsupported FFT rank diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc 
b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc index 7831c1b1b5b..0d4e7055ddb 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc @@ -60,6 +60,11 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( std::unique_ptr reordered_values( new std::string[sort_dimension_elements]); for (int64 index = 0; index < num_iteration_elements; ++index) { + // If the sort should be stable, we have to reinitialize indices to iota to + // guarantee that we still keep the relative order in case of ties. + if (is_stable && index > 0) { + std::iota(indices.get(), indices.get() + sort_dimension_elements, 0); + } // 'index' can be split into two values which index into the 'c' dimension // and the 'a' dimension, respectively. 'index' % 'c' is the index into the // 'c' dimension, 'index' / 'c' is the index into the 'a' dimension. When diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 7d6c4942b69..35db15fed2c 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -114,6 +114,22 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64( transpose_rhs); } +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, + transpose_lhs, transpose_rhs); +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, + transpose_lhs, transpose_rhs); +} + TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulS32( const void* run_options_ptr, int32* out, int32* lhs, int32* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h index 1280d04d01f..11dfc5c1d80 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_ +#include + #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/platform/types.h" @@ -44,6 +46,18 @@ extern void __xla_cpu_runtime_EigenMatMulF64( tensorflow::int64 k, tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs); +extern void __xla_cpu_runtime_EigenMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + +extern void __xla_cpu_runtime_EigenMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + extern void __xla_cpu_runtime_EigenMatMulS32( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, tensorflow::int32* rhs, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc index d2780dd694e..9476dce5ced 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -24,10 +24,11 @@ using tensorflow::int64; TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { + int32 double_precision, int32 fft_rank, int64 input_batch, + int64 fft_length0, int64 fft_length1, int64 fft_length2) { tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, static_cast(fft_type), - fft_rank, input_batch, fft_length0, fft_length1, + static_cast(double_precision), fft_rank, + input_batch, fft_length0, fft_length1, fft_length2); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h index dcd133d012c..2f0ccda2d10 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -22,7 +22,8 @@ extern "C" { extern void __xla_cpu_runtime_EigenSingleThreadedFft( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + void* operand, tensorflow::int32 fft_type, + tensorflow::int32 double_precision, tensorflow::int32 fft_rank, tensorflow::int64 input_batch, tensorflow::int64 fft_length0, tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index e395bc7426c..c7601f939c7 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -112,6 +112,24 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, transpose_lhs, transpose_rhs); } +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 
n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + SingleThreadedMatMulDispatch>( + run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + SingleThreadedMatMulDispatch>( + run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); +} + TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedMatMulS32(const void* run_options_ptr, int32* out, int32* lhs, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h index eb695910729..61fe224d420 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_ +#include + #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/platform/types.h" @@ -44,6 +46,20 @@ extern void __xla_cpu_runtime_EigenSingleThreadedMatMulF64( tensorflow::int64 k, tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs); +extern void __xla_cpu_runtime_EigenSingleThreadedMatMulC64( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + std::complex* out, std::complex* lhs, + std::complex* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + +extern void __xla_cpu_runtime_EigenSingleThreadedMatMulC128( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + std::complex* out, std::complex* lhs, + std::complex* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + extern void __xla_cpu_runtime_EigenSingleThreadedMatMulS32( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, tensorflow::int32* rhs, diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 395eb31c13f..4cc9e373b3e 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -246,6 +246,8 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulC64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulC128); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulS32); REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64); @@ -257,6 +259,8 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulC64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulC128); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulS32); REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin); REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue); diff --git 
a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index f52de3394fe..1ac8509cdb1 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -35,6 +35,19 @@ cc_library( ], ) +tf_cc_test( + name = "cpu_dyn_shape_test", + srcs = ["cpu_dyn_shape_test.cc"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "cpu_fusion_test", srcs = ["cpu_fusion_test.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc new file mode 100644 index 00000000000..46249caa0c7 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" + +namespace xla { +namespace cpu { +namespace { + +using CpuDynamicShapeTest = CpuCodegenTest; + +TEST_F(CpuDynamicShapeTest, DynamicShapeR2) { + HloComputation::Builder builder(TestName()); + + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); + dyn_input_shape.set_dynamic_dimension(0, true); + HloInstruction* param_x = builder.AddInstruction( + HloInstruction::CreateParameter(0, dyn_input_shape, "x")); + + builder.AddInstruction(HloInstruction::CreateUnary( + dyn_input_shape, HloOpcode::kNegate, param_x)); + auto hlo_module = CreateNewVerifiedModule(); + hlo_module->AddEntryComputation(builder.Build()); + + string filecheck_pattern = R"( +; CHECK: %[[dyn_dim_size:.*]] = load i32, i32* +; CHECK: %[[i64_dyn_dim_size:.*]] = sext i32 %[[dyn_dim_size:.*]] to i64 +; CHECK: icmp uge i64 %[[custom:.*]], %[[i64_dyn_dim_size:.*]] +; CHECK: %[[multiplier:.*]] = mul i64 1, %[[i64_dyn_dim_size:.*]] +; CHECK: mul nuw nsw i64 %[[custom:.*]], %[[multiplier:.*]] +)"; + + CpuAotCompilationOptions options{ + /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, + filecheck_pattern, + /*match_optimized_ir=*/false); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index b6d6de28bc5..efeab3bd31a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -70,6 +70,13 @@ class 
CpuUnaryIntrinsicTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? "" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; // Creates a module with a call to the unary op, and tests if the diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc index 8a72eb15487..757d878e224 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc @@ -69,6 +69,13 @@ class CpuVectorizationTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? "" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; TEST_P(CpuVectorizationTest, DoIt) { diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index cadea620ec6..bdaac32a0e5 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -116,9 +116,12 @@ class DfsHloVisitorBase { virtual Status HandleFft(HloInstructionPtr fft) = 0; virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0; virtual Status HandleCholesky(HloInstructionPtr hlo) = 0; + virtual Status HandleAllGather(HloInstructionPtr hlo) = 0; virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0; virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; + virtual Status HandleCollectivePermuteStart(HloInstructionPtr hlo) = 0; + virtual Status HandleCollectivePermuteDone(HloInstructionPtr hlo) = 0; virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0; virtual Status HandlePartitionId(HloInstructionPtr hlo) = 0; virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index baa9240fb56..b1d674fe467 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -98,6 +98,9 @@ class DfsHloVisitorWithDefaultBase Status HandleCholesky(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandleAllGather(HloInstructionPtr crs) override { + return DefaultAction(crs); + } Status HandleAllReduce(HloInstructionPtr crs) override { return DefaultAction(crs); } @@ -107,6 +110,12 @@ class DfsHloVisitorWithDefaultBase Status HandleCollectivePermute(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandleCollectivePermuteStart(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } + Status HandleCollectivePermuteDone(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } Status HandleReplicaId(HloInstructionPtr hlo) override { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc new file mode 100644 index 00000000000..fcdf85d5ecb --- /dev/null +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc @@ 
-0,0 +1,139 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { +namespace dot_as_convolution_util { + +/* static */ absl::optional +ParseDotGeneralFromConvolution(const HloInstruction* conv) { + CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); + if (conv->feature_group_count() != 1 || conv->batch_group_count() != 1) { + return absl::nullopt; + } + const auto& conv_dims = conv->convolution_dimension_numbers(); + DotGeneralAsConvolutionDimsInfo dims; + dims.lhs_non_contracting_dims.push_back( + {conv_dims.input_batch_dimension(), -1, + conv_dims.output_batch_dimension(), -1}); + dims.rhs_non_contracting_dims.push_back( + {-1, conv_dims.kernel_output_feature_dimension(), + conv_dims.output_feature_dimension(), -1}); + dims.contracting_dims.push_back({conv_dims.input_feature_dimension(), + conv_dims.kernel_input_feature_dimension(), + -1, -1}); + + for (int64 i = 0; i < conv_dims.input_spatial_dimensions_size(); ++i) { + int64 lhs = conv_dims.input_spatial_dimensions(i); + int64 lhs_size = conv->operand(0)->shape().dimensions(lhs); + int64 rhs = conv_dims.kernel_spatial_dimensions(i); + int64 rhs_size = conv->operand(1)->shape().dimensions(rhs); + int64 output = conv_dims.output_spatial_dimensions(i); + const auto& wd = conv->window().dimensions(i); + if (lhs_size == wd.size() && + std::max(1, lhs_size - 1) == wd.stride() && + lhs_size == wd.base_dilation() && wd.window_dilation() == 1 && + wd.padding_high() == 0 && wd.padding_low() == 0 && + !wd.window_reversal()) { + // A batch dimension in DotGeneral is represented as a spatial dimension + // with window size B (batch dimension size), stride B - 1, and base + // dilation B. + dims.batch_dims.push_back({lhs, rhs, output, i}); + } else if (lhs_size == wd.size() && wd.base_dilation() == 1 && + wd.window_dilation() == 1 && wd.padding_high() == 0 && + wd.padding_low() == 0 && !wd.window_reversal()) { + // A contracting dimension be represented as a spatial dimension with + // window size C (contracting dimension size). Stride can be any size + // since there is only one window. + dims.contracting_dims.push_back({lhs, rhs, output, i}); + } else if (wd.stride() == 1 && wd.window_dilation() == 1 && + wd.base_dilation() == 1) { + if (rhs_size == 1 && wd.size() == 1 && wd.padding_high() == 0 && + wd.padding_low() == 0 && !wd.window_reversal()) { + // A LHS non-contracting dimension can be represented as a spatial + // dimension with window size 1. 
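+ // For example, the M dimension of a [B, M, K] x [B, K, N] dot maps to an
+ // input spatial dimension of size M convolved with a size-1 kernel
+ // dimension: with window size 1, stride 1 and no padding, each output
+ // element along M reads exactly one input element, so the dimension passes
+ // through unchanged.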
+ dims.lhs_non_contracting_dims.push_back({lhs, rhs, output, i}); + } else if (lhs_size == 1 && wd.size() == rhs_size && + wd.padding_high() == rhs_size - 1 && + wd.padding_low() == rhs_size - 1 && wd.window_reversal()) { + // A RHS non-contracting dimension can be represented as a spatial + // dimension with window size N (non-contracting dimension size), low + // padding N - 1, high padding N - 1 and window reversal. + dims.rhs_non_contracting_dims.push_back({lhs, rhs, output, i}); + } else { + return absl::nullopt; + } + } else { + return absl::nullopt; + } + } + + return dims; +} + +StatusOr> +CreateShardedConvForDotGeneralConvolution( + const HloInstruction& conv, + const DotGeneralAsConvolutionDimsInfo& dot_dnums, + HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo) { + CHECK_EQ(conv.opcode(), HloOpcode::kConvolution); + const auto& conv_dnums = conv.convolution_dimension_numbers(); + auto window = conv.window(); + for (const auto& dim : dot_dnums.batch_dims) { + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + for (const auto& dim : dot_dnums.contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + } + for (const auto& dim : dot_dnums.rhs_non_contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_rhs_hlo->shape().dimensions( + conv_dnums.kernel_spatial_dimensions(dim.spatial_dim))); + wd->set_padding_high(wd->size() - 1); + wd->set_padding_low(wd->size() - 1); + } + TF_ASSIGN_OR_RETURN(Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), + /*feature_group_count=*/1, + /*batch_group_count=*/1, window, conv_dnums)); + *sharded_conv_shape.mutable_layout() = conv.shape().layout(); + return HloInstruction::CreateConvolve( + sharded_conv_shape, sharded_lhs_hlo, sharded_rhs_hlo, + /*feature_group_count=*/1, + /*batch_group_count=*/1, window, conv_dnums, conv.precision_config()); +} + +} // namespace dot_as_convolution_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.h b/tensorflow/compiler/xla/service/dot_as_convolution_util.h new file mode 100644 index 00000000000..a3e829a3d31 --- /dev/null +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.h @@ -0,0 +1,68 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +namespace xla { +namespace dot_as_convolution_util { + +// Describes the dimensions of a convolution that can be interpreted as a dot. +struct DotGeneralAsConvolutionDimsInfo { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimNums { + int64 lhs; + int64 rhs; + int64 output; + // The corresponding spatial dimension in the convolution's config. Set to + // -1 if it's not mapped to a spatial dimension. + int64 spatial_dim; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo if it can +// be interpreted as a dot, or absl::nullopt otherwise. +absl::optional ParseDotGeneralFromConvolution( + const HloInstruction* conv); + +// Creates sharded convolution instruction that can be interpreted as a dot. +// This is a utility for per-op partitioners. +// - 'conv' is the original convolution instruction. +// - 'dot_dnums' is the result of ParseDotGeneralFromConvolution() for 'conv'. +// - 'sharded_lhs_hlo' and 'sharded_rhs_hlo' are sharded inputs for the result +// convolution instruction. +StatusOr> +CreateShardedConvForDotGeneralConvolution( + const HloInstruction& conv, + const DotGeneralAsConvolutionDimsInfo& dot_dnums, + HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo); + +} // namespace dot_as_convolution_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc index 353a7f5cebc..40354dec3c6 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer.cc @@ -31,7 +31,7 @@ namespace { // Convert a dot into a canonical form where non-contracting and contracting // dimensions are reshaped together and batch dimensions are the most major -// dimensions. The requires transposing and reshapes the lhs and rhs and +// dimensions. This requires transposing and reshapes of the lhs and rhs and // reshaping the output batch to the original shape. Status CanonicalizeDot(HloInstruction* original_dot) { auto computation = original_dot->parent(); @@ -80,7 +80,9 @@ Status CanonicalizeDot(HloInstruction* original_dot) { lhs_shape), original_dot->mutable_operand(0), lhs_transpose)); std::vector lhs_reshape_dims = batch_dim_sizes; - lhs_reshape_dims.push_back(lhs_non_contracting_size); + if (lhs_non_contracting_size > 1) { + lhs_reshape_dims.push_back(lhs_non_contracting_size); + } lhs_reshape_dims.push_back(lhs_contracting_size); // Reshape the contracting and non-contracting dimensions together. 
HloInstruction* reshaped_lhs = @@ -126,7 +128,9 @@ Status CanonicalizeDot(HloInstruction* original_dot) { std::vector rhs_reshape_dims = batch_dim_sizes; rhs_reshape_dims.push_back(rhs_contracting_size); - rhs_reshape_dims.push_back(rhs_non_contracting_size); + if (rhs_non_contracting_size > 1) { + rhs_reshape_dims.push_back(rhs_non_contracting_size); + } // Reshape the contracting and non-contracting dimensions together. HloInstruction* reshaped_rhs = computation->AddInstruction(HloInstruction::CreateReshape( @@ -134,15 +138,20 @@ Status CanonicalizeDot(HloInstruction* original_dot) { transposed_rhs)); std::vector dot_dims = batch_dim_sizes; - dot_dims.push_back(lhs_non_contracting_size); - dot_dims.push_back(rhs_non_contracting_size); + if (lhs_non_contracting_size > 1) { + dot_dims.push_back(lhs_non_contracting_size); + } + if (rhs_non_contracting_size > 1) { + dot_dims.push_back(rhs_non_contracting_size); + } DotDimensionNumbers dot_dnums; for (int64 i = 0; i < num_batch_dims; ++i) { dot_dnums.add_lhs_batch_dimensions(i); dot_dnums.add_rhs_batch_dimensions(i); } - dot_dnums.add_lhs_contracting_dimensions(num_batch_dims + 1); + dot_dnums.add_lhs_contracting_dimensions( + num_batch_dims + (lhs_non_contracting_size > 1 ? 1 : 0)); dot_dnums.add_rhs_contracting_dimensions(num_batch_dims); HloInstruction* dot = computation->AddInstruction(HloInstruction::CreateDot( @@ -174,9 +183,9 @@ StatusOr DotDecomposer::Run(HloModule* module) { } // A dot is not canonical if it has more than one non-contracting // dimension. - if (dnums.lhs_batch_dimensions_size() + 2 != + if (dnums.lhs_batch_dimensions_size() + 2 < instruction->operand(0)->shape().rank() || - dnums.rhs_batch_dimensions_size() + 2 != + dnums.rhs_batch_dimensions_size() + 2 < instruction->operand(1)->shape().rank()) { non_canonical_dots.push_back(instruction); continue; diff --git a/tensorflow/compiler/xla/service/dot_decomposer_test.cc b/tensorflow/compiler/xla/service/dot_decomposer_test.cc index 67fff50eaf6..c4152393933 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer_test.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer_test.cc @@ -50,5 +50,75 @@ TEST_F(DotDecomposerTest, CanonicalizeMultipleNonContractingDims) { op::Shape("f32[4032,512]")))); } +TEST_F(DotDecomposerTest, DontCanonicalizeIfNoNoncontractingDims) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4]{1,0} parameter(0) + p1 = f32[64,4]{1,0} parameter(1) + ROOT dot = f32[64]{0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_FALSE(canonicalized); +} + +TEST_F(DotDecomposerTest, DontAddLhsNonContractingDimIfOne) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4]{1,0} parameter(0) + p1 = f32[64,4,2,1]{3,2,1,0} parameter(1) + ROOT dot = f32[64,2,1]{2,1,0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_TRUE(canonicalized); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Reshape(AllOf(op::Dot(op::Reshape(), op::Reshape(), + 
/*lhs_contracting_dim=*/1, + /*rhs_contracting_dim=*/1), + op::Shape("f32[64,2]")))); +} + +TEST_F(DotDecomposerTest, DontAddRhsNonContractingDimIfOne) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4,2,1]{3,2,1,0} parameter(0) + p1 = f32[64,4]{1,0} parameter(1) + ROOT dot = f32[64,2,1]{2,1,0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_TRUE(canonicalized); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Reshape(AllOf(op::Dot(op::Reshape(), op::Reshape(), + /*lhs_contracting_dim=*/2, + /*rhs_contracting_dim=*/1), + op::Shape("f32[64,2]")))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 30300b8c195..8cb660de46c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -2422,6 +2422,43 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( -> StatusOr { return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; + case HloOpcode::kMap: + return [this, hlo, &operand_to_generator]( + const IrArray::Index& index) -> StatusOr { + std::vector operands; + for (int i = 0; i < hlo->operand_count(); i++) { + TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, + operand_to_generator.at(hlo->operand(i))(index)); + operands.push_back(operand_value); + } + std::vector input_generators; + for (const HloInstruction* instr : hlo->operands()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalMap(Cast(hlo), operands); + }; + case HloOpcode::kReduceWindow: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + return EmitElementalReduceWindow( + Cast(hlo), + operand_to_generator.at(hlo->operand(0)), + operand_to_generator.at(hlo->operand(1)), index); + }; + case HloOpcode::kReduce: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + auto reduce_instr = Cast(hlo); + std::vector input_generators; + for (const HloInstruction* instr : reduce_instr->inputs()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + + std::vector initial_value_generators; + for (const HloInstruction* instr : reduce_instr->init_values()) { + initial_value_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalReduce(reduce_instr, std::move(input_generators), + std::move(initial_value_generators), index); + }; default: return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", @@ -2451,4 +2488,215 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, return complex; } +StatusOr ElementalIrEmitter::EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands) { + TF_ASSIGN_OR_RETURN( + std::vector values, + EmitThreadLocalCall(*map_instr->to_apply(), elemental_operands, + llvm_ir::IrName(map_instr))); + CHECK_EQ(values.size(), 1); + return values[0]; +} + +StatusOr ElementalIrEmitter::EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& 
initial_value_generator, + const llvm_ir::IrArray::Index& index) { + // Pseudocode: + // for each index I in output + // value = init_value + // for each index W in window + // for each dimension i from 0 to rank - 1 + // (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i] + // if I in bounds of input + // value = function(value, input[I]) + // output[O] = value + const HloInstruction* operand = reduce_window->operand(0); + const Window& window = reduce_window->window(); + + PrimitiveType operand_element_type = operand->shape().element_type(); + llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), + "reduce_window_accum_ptr", b_); + { + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generator(llvm_ir::IrArray::Index(index.GetType()))); + Store(init_value, accum_ptr); + } + + llvm::Type* index_type = index.GetType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return index.GetConstantWithIndexType(c); + }; + + llvm_ir::ForLoopNest loops(IrName(reduce_window), b_, index_type); + std::vector window_size; + for (const auto& dim : window.dimensions()) { + window_size.push_back(dim.size()); + } + const IrArray::Index window_index = loops.AddLoopsForShape( + ShapeUtil::MakeShape(operand_element_type, window_size), "window"); + CHECK_EQ(window_index.size(), index.size()); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); + + std::vector input_multi_index(index.size()); + llvm::Value* in_bounds = b_->getInt1(true); + for (size_t i = 0; i < index.size(); ++i) { + llvm::Value* stridden_index = + NSWMul(index[i], index_typed_const(window.dimensions(i).stride())); + input_multi_index[i] = NSWSub( + NSWAdd( + stridden_index, + NSWMul(window_index[i], + index_typed_const(window.dimensions(i).window_dilation()))), + index_typed_const(window.dimensions(i).padding_low())); + + // We need to verify that we are not in the dilated base area. + llvm::Value* dilation_condition = + ICmpEQ(SRem(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())), + index_typed_const(0)); + in_bounds = And(in_bounds, dilation_condition); + + // Apply base dilation to the index. + input_multi_index[i] = + SDiv(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())); + + // We must check whether 0 <= input_multi_index[i] < bound, as + // otherwise we are in the pad and so can skip the computation. This + // comparison is equivalent to the unsigned comparison + // input_multi_index[i] < bound, as a negative value wraps to a large + // positive value. + in_bounds = And(in_bounds, + ICmpULT(input_multi_index[i], + index_typed_const(operand->shape().dimensions(i)))); + } + + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); + SetToFirstInsertPoint(if_data.true_block, b_); + + // We are not in pad, so do the computation. 
+ IrArray::Index input_index(input_multi_index, operand->shape(), index_type); + TF_ASSIGN_OR_RETURN(llvm::Value * input_value, input_generator(input_index)); + TF_ASSIGN_OR_RETURN( + std::vector accum_values, + EmitThreadLocalCall(*reduce_window->to_apply(), + {Load(accum_ptr), input_value}, "reducer_function")); + CHECK_EQ(accum_values.size(), 1); + Store(accum_values[0], accum_ptr); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); + return Load(accum_ptr); +} + +StatusOr ElementalIrEmitter::EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index) { + const Shape& out_shape = reduce->shape(); + bool is_variadic = !out_shape.IsArray(); + int accumulators_count = 1; + if (is_variadic) { + CHECK(out_shape.IsTuple()); + accumulators_count = out_shape.tuple_shapes_size(); + } + + absl::Span reduced_dimensions(reduce->dimensions()); + + std::vector accumulator_addrs; + std::vector accumulator_types; + llvm::Type* index_type = index.GetType(); + for (int i = 0; i < accumulators_count; i++) { + const Shape& element_shape = + is_variadic ? out_shape.tuple_shapes(i) : out_shape; + PrimitiveType accumulator_type = element_shape.element_type(); + llvm::Type* accumulator_llvm_type = + llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); + accumulator_types.push_back(accumulator_llvm_type); + + // Initialize an accumulator with init_value. + llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( + accumulator_llvm_type, "accumulator_" + std::to_string(i), b()); + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generators[i](llvm_ir::IrArray::Index(index_type))); + Store(init_value, accumulator_addr); + accumulator_addrs.push_back(accumulator_addr); + } + + // The enclosing loops go over all the target elements. Now we have to compute + // the actual target element. For this, we build a new loop nest to iterate + // over all the reduction dimensions in the argument. + // AddLoopsForShapeOnDimensions will return an Index where induction Value*s + // are placed for each dimension in dimensions, and all the rest are nullptrs. + llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), b(), index_type); + const HloInstruction* arg = reduce->operand(0); + std::vector input_multi_index = + loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, + "reduction_dim"); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); + + // Build a full index for the input argument, using input_multi_index as the + // base. In input_multi_index only the reduction dimensions are filled in. We + // fill in the rest of the dimensions with induction Value*s taken from + // 'index' which iterates over the target array. See the high-level + // description in the XLA documentation for details. 
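+ // For example, reducing an f32[7,5,3] operand over dimension {1}: the loop
+ // nest above yields input_multi_index = {nullptr, r, nullptr}, and the two
+ // nullptrs are filled below from the output index {o0, o1}, giving the
+ // input index {o0, r, o1}.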
+ auto it = index.begin(); + + for (auto& i : input_multi_index) { + if (i == nullptr) { + i = *it++; + } + } + CHECK(index.end() == it); + llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), + index_type); + + std::vector reduction_operands; + for (llvm::Value* accum : accumulator_addrs) { + llvm::Value* accum_value = Load(accum); + reduction_operands.push_back(accum_value); + } + + for (int i = 0; i < accumulators_count; i++) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, + input_generators[i](input_index)); + reduction_operands.push_back(input_element); + } + + TF_ASSIGN_OR_RETURN( + std::vector results, + EmitThreadLocalCall(*reduce->to_apply(), reduction_operands, + "reduce_function")); + + CHECK(results.size() == accumulators_count); + for (int i = 0; i < accumulators_count; i++) { + Store(results[i], accumulator_addrs[i]); + } + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); + + if (is_variadic) { + // Emit a structure, as that what the LoopEmitter expects. + llvm::Value* returned_structure = llvm::UndefValue::get( + llvm::StructType::get(b()->getContext(), accumulator_types)); + for (int i = 0; i < accumulators_count; i++) { + llvm::Value* accumulator_value = Load(accumulator_addrs[i]); + returned_structure = + b()->CreateInsertValue(returned_structure, accumulator_value, i); + } + return returned_structure; + } else { + CHECK_EQ(accumulator_addrs.size(), 1); + return Load(accumulator_addrs[0]); + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 94e8f1d6400..06a9d7b194c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -17,12 +17,17 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_ELEMENTAL_IR_EMITTER_H_ #include +#include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" @@ -220,6 +225,26 @@ class ElementalIrEmitter : public IrBuilderMixin { const HloToElementGeneratorMap& operand_to_generator, const llvm_ir::IrArray::Index& dot_result_index); + virtual StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) = 0; + + StatusOr EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands); + + StatusOr EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, + const llvm_ir::IrArray::Index& index); + + StatusOr EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index); + llvm::IRBuilder<>* const b_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index f1ac1fef451..5d7bd26b01e 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -63,10 +63,6 @@ class ExecutionInput { explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {} explicit ExecutionInput(ShapeTree buffers) : buffers_(std::move(buffers)) {} - ExecutionInput(ShapeTree buffers, - std::vector owner_held_indices) - : buffers_(std::move(buffers)), - unowned_indices_(std::move(owner_held_indices)) {} ExecutionInput(ExecutionInput&&) = default; ~ExecutionInput() { diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 61bc41283e1..09382cfec3f 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "if_rocm_is_configured", +) load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm_is_configured", -) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -684,7 +684,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", @@ -720,7 +720,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + 
"//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -819,6 +819,7 @@ cc_library( deps = [ # LINT.IfChange "@local_config_cuda//cuda:cublas_headers", + "@local_config_cuda//cuda:cusolver_headers", # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -901,12 +902,15 @@ cc_library( ":ir_emission_utils", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_reachability", - "//tensorflow/compiler/xla/service:multi_output_fusion", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", ], ) @@ -1674,7 +1678,7 @@ tf_proto_library_cc( protodeps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:autotuning_proto", + "//tensorflow/core/protobuf:autotuning_proto", ], ) @@ -1685,8 +1689,8 @@ cc_library( deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:autotuning_proto_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/protobuf:autotuning_proto_cc", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c6df786fb51..eee0fc83481 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -260,6 +260,13 @@ StatusOr GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, llvm::Value* value) { + // When F64 is being requested, assume performance is less important and use + // the more numerically precise tanh function. + if (prim_type == F64) { + return EmitDeviceMathCall(TargetDeviceFunctionID::kTanh, {value}, + {prim_type}, prim_type); + } + // Emit a fast approximation of tanh instead of calling __nv_tanh. 
// __nv_tanh is particularly bad because it contains branches, thus // preventing LLVM's load-store vectorizer from working its magic across a @@ -305,168 +312,5 @@ llvm::Value* GpuElementalIrEmitter::EmitThreadId() { return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } -llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) { - switch (hlo->opcode()) { - case HloOpcode::kMap: - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - TF_RET_CHECK(!hlo->operands().empty()) - << "Zero operand map not implemented in GPU backend."; - TF_RET_CHECK(hlo->to_apply()->num_parameters() > 0); - std::vector operand_elements; - for (HloInstruction* operand : hlo->operands()) { - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(index)); - operand_elements.push_back(value); - } - return compute_nested_(*hlo->to_apply(), operand_elements); - }; - case HloOpcode::kReduceWindow: - // Pseudocode: - // for each index I in output - // value = init_value - // for each index W in window - // for each dimension i from 0 to rank - 1 - // (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i] - // if I in bounds of input - // value = function(value, input[I]) - // output[O] = value - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - const Window& window = hlo->window(); - - PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accum_ptr", b_); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index.GetType()))); - Store(init_value, accum_ptr); - } - - llvm::Type* index_type = index.GetType(); - auto index_typed_const = [&](uint64 c) -> llvm::Constant* { - return index.GetConstantWithIndexType(c); - }; - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds = b_->getInt1(true); - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* stridden_index = NSWMul( - index[i], index_typed_const(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(stridden_index, - NSWMul(window_index[i], - index_typed_const( - window.dimensions(i).window_dilation()))), - index_typed_const(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = ICmpEQ( - SRem(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())), - index_typed_const(0)); - in_bounds = And(in_bounds, dilation_condition); - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())); - - // We must check whether 0 <= input_multi_index[i] < bound, as - // otherwise we are in the pad and so can skip the computation. 
This - // comparison is equivalent to the unsigned comparison - // input_multi_index[i] < bound, as a negative value wraps to a large - // positive value. - in_bounds = - And(in_bounds, - ICmpULT(input_multi_index[i], - index_typed_const(operand->shape().dimensions(i)))); - } - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); - SetToFirstInsertPoint(if_data.true_block, b_); - - // We are not in pad, so do the computation. - IrArray::Index input_index(input_multi_index, operand->shape(), - index_type); - TF_ASSIGN_OR_RETURN(llvm::Value * input_value, - operand_to_generator.at(operand)(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), {Load(accum_ptr), input_value})); - Store(accum_value, accum_ptr); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); - return Load(accum_ptr); - }; - case HloOpcode::kReduce: - // TODO(b/118332391): This should be supported. - CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce"; - return [=, &operand_to_generator]( - const IrArray::Index& output_index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - llvm::Value* accum_ptr = - b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType( - hlo->shape().element_type(), module_)); - llvm::Type* index_type = output_index.GetType(); - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index_type))); - b()->CreateStore(init_value, accum_ptr); - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions( - operand->shape(), hlo->dimensions(), "reduction_dim"); - if (!ShapeUtil::IsScalar(hlo->shape())) { - // Here only input_multi_index[hlo->dimensions()] are non-null, so we - // must set the rest. - size_t j = 0; - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = output_index[j++]; - } - } - CHECK_EQ(output_index.size(), j); - } - llvm_ir::IrArray::Index input_index( - input_multi_index, hlo->operand(0)->shape(), index_type); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); - TF_ASSIGN_OR_RETURN( - llvm::Value * input_value, - operand_to_generator.at(hlo->operand(0))(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), - {b()->CreateLoad(accum_ptr), input_value})); - b()->CreateStore(accum_value, accum_ptr); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); - return b()->CreateLoad(accum_ptr); - }; - default: - return ElementalIrEmitter::MakeElementGenerator(hlo, - operand_to_generator); - } -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index c8a58a21980..a3056b1ddad 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -40,17 +40,13 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { public: // A NestedComputer computes an element of the output of the given computation // given a Span of its input elements. 
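// Why NestedComputer now returns a vector of values rather than a single one: a
// nested computation with a tuple-shaped root (e.g. the combiner of a variadic
// reduce) produces one scalar per tuple element. This standalone sketch models
// that calling convention with plain doubles; it illustrates the shape of the
// interface only, not the actual XLA/LLVM types.
#include <functional>
#include <vector>

// One result per element of the callee's tuple-shaped root (cf. the new
// StatusOr<std::vector<llvm::Value*>> return type above).
using ToyNestedComputer =
    std::function<std::vector<double>(const std::vector<double>&)>;

int main() {
  // Combiner of an argmax-style reduction: parameters are
  // {acc_value, acc_index, elem_value, elem_index}; the result is the new
  // {acc_value, acc_index} pair -- two scalars, hence the vector return.
  ToyNestedComputer argmax_combiner =
      [](const std::vector<double>& p) -> std::vector<double> {
    return p[2] > p[0] ? std::vector<double>{p[2], p[3]}
                       : std::vector<double>{p[0], p[1]};
  };
  std::vector<double> result = argmax_combiner({1.5, 0.0, 4.0, 7.0});
  return result.size() == 2 ? 0 : 1;
}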
- using NestedComputer = std::function( + using NestedComputer = std::function>( const HloComputation&, absl::Span)>; GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config, llvm::Module* module, llvm::IRBuilder<>* b, NestedComputer compute_nested); - llvm_ir::ElementGenerator MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) override; - protected: StatusOr EmitFloatBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, @@ -92,6 +88,12 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitComplexAbs(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view) override { + return compute_nested_(callee, parameters); + } + llvm::Value* EmitThreadId() override; private: diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index 991a463f2a0..9d6be3c78ea 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -60,16 +60,18 @@ StatusOr> FftScratchAllocator::AllocateBytes( namespace { -se::fft::Type FftTypeToSeType(FftType type) { +se::fft::Type FftTypeToSeType(FftType type, bool double_precision) { switch (type) { case FftType::FFT: - return se::fft::Type::kC2CForward; + return double_precision ? se::fft::Type::kZ2ZForward + : se::fft::Type::kC2CForward; case FftType::IFFT: - return se::fft::Type::kC2CInverse; + return double_precision ? se::fft::Type::kZ2ZInverse + : se::fft::Type::kC2CInverse; case FftType::IRFFT: - return se::fft::Type::kC2R; + return double_precision ? se::fft::Type::kZ2D : se::fft::Type::kC2R; case FftType::RFFT: - return se::fft::Type::kR2C; + return double_precision ? 
se::fft::Type::kD2Z : se::fft::Type::kR2C; default: LOG(FATAL) << "unsupported fft type"; } @@ -78,12 +80,16 @@ se::fft::Type FftTypeToSeType(FftType type) { string FftTypeToString(se::fft::Type type) { switch (type) { case se::fft::Type::kC2CForward: + case se::fft::Type::kZ2ZForward: return "FFT"; case se::fft::Type::kC2CInverse: + case se::fft::Type::kZ2ZInverse: return "IFFT"; case se::fft::Type::kC2R: + case se::fft::Type::kZ2D: return "IRFFT"; case se::fft::Type::kR2C: + case se::fft::Type::kD2Z: return "RFFT"; default: LOG(FATAL) << "unknown fft type"; @@ -98,7 +104,9 @@ FftThunk::FftThunk(FftType fft_type, absl::Span fft_length, const Shape& input_shape, const Shape& output_shape, const HloInstruction* hlo) : Thunk(Kind::kFft, hlo), - fft_type_(FftTypeToSeType(fft_type)), + fft_type_( + FftTypeToSeType(fft_type, input_shape.element_type() == F64 || + input_shape.element_type() == C128)), fft_length_(fft_length.begin(), fft_length.end()), scale_factor_(1.0f), input_buffer_(input_buffer), @@ -166,6 +174,15 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); break; } + case se::fft::Type::kZ2ZForward: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + break; + } case se::fft::Type::kC2CInverse: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -181,6 +198,22 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { } break; } + case se::fft::Type::kZ2ZInverse: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + if (launch_ok) { + launch_ok = + stream + .ThenBlasScal(ShapeUtil::ElementsIn(output_shape_), + complex128(scale_factor_), &output_data, 1) + .ok(); + } + break; + } case se::fft::Type::kR2C: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -190,6 +223,15 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); break; } + case se::fft::Type::kD2Z: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + break; + } case se::fft::Type::kC2R: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -205,6 +247,21 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { } break; } + case se::fft::Type::kZ2D: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + if (launch_ok) { + launch_ok = stream + .ThenBlasScal(ShapeUtil::ElementsIn(output_shape_), + scale_factor_, &output_data, 1) + .ok(); + } + break; + } default: LOG(FATAL) << "unsupported fft type"; } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 
2df6b50d361..b9d1f3ef158 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -176,7 +176,6 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); - TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -387,6 +386,10 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( assignment_.get(), executor->device_ordinal(), memory_allocator)); } + for (Thunk* thunk : thunk_schedule_->TotalOrder()) { + TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); + } + TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, block_host_until_done, hlo_execution_profile)); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 1316e8ad1aa..bb4184ff76f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -351,6 +351,9 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { + VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() + << " and " << instr2.ToString() << " would be over the budget of " + << kSharedMemoryBudgetInBytes << "B"; return true; } @@ -383,6 +386,14 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, num_output_buffers <= kMaxOperandsAndOutputsPerFusion) { return false; + } else { + VLOG(5) << "Operand count of " + << "(" << instr1.ToString() << " ) = " << instr1.operand_count() + << " and ( " << instr2.ToString() + << " ) = " << instr2.operand_count() + << " and num_output_buffers = " << num_output_buffers + << " is bigger than the bound of " + << kMaxOperandsAndOutputsPerFusion; } // Compute the precise number of operands to the new fusion. 
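// The new VLOG(5) messages above report which budget rejected a candidate
// fusion. A simplified, standalone restatement of the two checks, with
// illustrative constants and a plain struct instead of HloInstruction; the real
// limits and the precise operand counting live in gpu_fusible.cc.
#include <cstdint>
#include <iostream>

namespace {
constexpr int64_t kSharedMemoryBudgetInBytes = 48 * 1024;  // illustrative
constexpr int64_t kMaxOperandsAndOutputsPerFusion = 64;    // illustrative

struct FusionCandidate {
  int64_t shared_memory_bytes;
  int64_t operand_count;
};

bool FusionWouldBeTooLargeSketch(const FusionCandidate& a,
                                 const FusionCandidate& b,
                                 int64_t num_output_buffers) {
  if (a.shared_memory_bytes + b.shared_memory_bytes >
      kSharedMemoryBudgetInBytes) {
    std::cerr << "rejecting fusion: combined shared memory over budget\n";
    return true;
  }
  if (a.operand_count + b.operand_count + num_output_buffers >
      kMaxOperandsAndOutputsPerFusion) {
    std::cerr << "rejecting fusion: too many operands and outputs\n";
    return true;
  }
  return false;
}
}  // namespace

int main() {
  // 40KiB + 16KiB of shared memory exceeds the illustrative 48KiB budget.
  bool too_large = FusionWouldBeTooLargeSketch({40 << 10, 3}, {16 << 10, 2}, 1);
  return too_large ? 0 : 1;
}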
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index 05fa798dc39..cb22b4d9042 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -96,7 +96,8 @@ Status GpuTransferManager::EnqueueBuffersToInfeed( StatusOr GpuTransferManager::TransferBufferToInfeedInternal( se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %d bytes", size); + return InvalidArgument("GPU infeed of %d bytes exceeds maximum of %d bytes", + size, std::numeric_limits::max()); } if (size == 0) { diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index fc1c1bb4ab1..a0580e2ab04 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -65,12 +65,16 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { if (!ShouldFuseInexpensiveChecks(consumer, operand_index)) { + VLOG(5) << "Not fusing inexpensive checks of operand " << operand_index + << " of " << consumer->ToString(); return false; } auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. if (FusionWouldBeTooLarge(*consumer, *producer)) { + VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" + << consumer->ToString() << ") would be too large"; return false; } if (consumer->opcode() != HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 011eb07d3bd..aa8a6215cc7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -222,7 +222,7 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( // Derive a minimum alignment from the type. The optimizer can increase it // later. store->setAlignment( - llvm::MaybeAlign(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); + llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); return true; } @@ -698,115 +698,6 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) { return Status::OK(); } -Status IrEmitter::HandleReduce(HloInstruction* instr) { - const HloReduceInstruction* reduce = Cast(instr); - const Shape& out_shape = reduce->shape(); - bool returns_tuple = !out_shape.IsArray(); - int accumulators_count = 1; - if (returns_tuple) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - auto arg = reduce->operand(0); - absl::Span dimensions(reduce->dimensions()); - HloComputation* function = reduce->to_apply(); - return EmitTargetElementLoop( - *reduce, - [=](const llvm_ir::IrArray::Index& index) -> StatusOr { - std::vector accumulator_addrs; - std::vector accumulator_types; - - // Initialize accumulators with initial values. - for (int i = 0; i < accumulators_count; i++) { - auto init_value = reduce->init_values()[i]; - const Shape& element_shape = - returns_tuple ? 
out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - llvm::AllocaInst* accumulator_addr = Alloca(accumulator_llvm_type); - Store(Load(GetBasePointer(*init_value)), accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - accumulator_types.push_back(accumulator_llvm_type); - } - - // The enclosing loops go over all the target elements. Now we have to - // compute the actual target element. For this, we build a new loop nest - // to iterate over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction - // Value*s are placed for each dimension in dimensions, and all the rest - // are nullptrs. - llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using reduced_dims_index - // as the base. In reduced_dims_index only the reduction dimensions are - // filled in. We fill in the rest of the dimensions with induction - // Value*s taken from 'index' which iterates over the target array. - // See the high-level description in the XLA documentation for details. - llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - - // Apply the reduction function to the loaded value. - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - std::vector reduction_operands(accumulator_addrs.begin(), - accumulator_addrs.end()); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* input_address = - GetIrArray(*reduce->operand(i), *reduce) - .EmitArrayElementAddress(input_index, &b_); - reduction_operands.push_back(input_address); - } - - llvm::Value* ret_argument; - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - ret_argument = accumulator_addrs[0]; - } else { - const Shape& return_shape = function->root_instruction()->shape(); - - llvm::Type* return_value_buffer_type = - llvm_ir::ShapeToIrType(return_shape, module_); - ret_argument = Alloca(return_value_buffer_type); - llvm_ir::IrArray tuple_array(ret_argument, return_shape); - EmitTuple(tuple_array, accumulator_addrs, &b_); - } - - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *function, reduction_operands, ret_argument)); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } else { - // Emit a struct for the LoopEmitter dealing with multi-output - // fusion. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } - }); -} - Status IrEmitter::HandleFusion(HloInstruction* fusion) { // kFusion for library calls should be handled by // IrEmitterUnnested::HandleFusion. 
@@ -866,22 +757,39 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) { "to a cudnn CustomCall using CudnnBatchNormRewriter."); } -StatusOr IrEmitter::ComputeNestedElement( +StatusOr> IrEmitter::ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements) { + const Shape& return_shape = computation.root_instruction()->shape(); llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType( - computation.root_instruction()->shape().element_type(), module_), - "return_buffer", &b_); + llvm_ir::ShapeToIrType(return_shape, module_), "return_buffer", &b_); std::vector parameter_buffers; for (llvm::Value* parameter_element : parameter_elements) { parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry( parameter_element->getType(), "parameter_buffer", &b_)); Store(parameter_element, parameter_buffers.back()); } + + std::vector allocas_for_returned_scalars; + if (!return_shape.IsTuple()) { + allocas_for_returned_scalars.push_back(return_buffer); + } else { + allocas_for_returned_scalars = + llvm_ir::EmitTupleAllocasAtFunctionEntry(return_shape, &b_); + llvm_ir::IrArray tuple_array(return_buffer, return_shape); + + EmitTuple(tuple_array, allocas_for_returned_scalars, &b_); + } + TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers, return_buffer)); - return Load(return_buffer); + + std::vector returned_scalars; + returned_scalars.reserve(allocas_for_returned_scalars.size()); + for (llvm::Value* addr : allocas_for_returned_scalars) { + returned_scalars.push_back(Load(addr)); + } + return returned_scalars; } std::vector IrEmitter::ConstructIrArrayForOutputs( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index e0fe454dcfe..93712961ea2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -89,7 +89,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleRecv(HloInstruction* recv) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleParameter(HloInstruction* parameter) override; - Status HandleReduce(HloInstruction* reduce) override; Status HandleTuple(HloInstruction* tuple) override; Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; @@ -213,7 +212,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, const llvm_ir::IrArray::Index& compare_keys_index, const llvm_ir::IrArray& keys_array); - StatusOr ComputeNestedElement( + StatusOr> ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ec5f10bd2e8..a78ffc8dd1a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2016,7 +2016,9 @@ void IrEmitterUnnested::EmitTile( // True iff all threads always execute all instructions in the tiling // dimension X. 
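// Standalone restatement of the predicate changed just below: per-element bounds
// checks in the tiled X dimension may only be skipped when the dimension divides
// evenly into tiles *and* the mapping scheme reports contiguous rows. Plain
// types stand in for KernelMappingScheme; kDimX's value here is illustrative.
#include <array>
#include <cstdint>

constexpr int kDimX = 2;  // {z, y, x} ordering, as in the mapping scheme

bool XTileFits(const std::array<int64_t, 3>& dims_in_elems,
               int64_t tile_size_x, bool row_contiguous) {
  return dims_in_elems[kDimX] % tile_size_x == 0 && row_contiguous;
}

int main() {
  // 512 elements in X split into 64-wide tiles fits, but only if rows are
  // contiguous; the 0-2-1 transpose path passes /*is_row_contiguous=*/false.
  return XTileFits({8, 32, 512}, 64, /*row_contiguous=*/true) ? 0 : 1;
}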
- bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; + bool x_tile_fits = + mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0 && + mapping_scheme.GetRowContiguous(); // The outer loop below is simply doing: // @@ -2731,7 +2733,8 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, - /*vector_size=*/1); + /*vector_size=*/1, + /*is_row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 5e15d0767a1..d5c4ecbc795 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -90,13 +90,14 @@ class KernelMappingScheme { KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, int64 num_threads_x, IndexingOrder indexing_order, - int vector_size) + int vector_size, bool is_row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), - vector_size_(vector_size) { + vector_size_(vector_size), + is_row_contiguous_(is_row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -134,6 +135,7 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } int GetVectorSize() const { return vector_size_; } + bool GetRowContiguous() const { return is_row_contiguous_; } private: // The number of elements in each dimension. @@ -159,6 +161,7 @@ class KernelMappingScheme { // to trigger vectorized loads on GPUs while keeping memory // coalescing. const int vector_size_; + const bool is_row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 060a0375271..497dcda4361 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -689,7 +689,7 @@ std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); + hlo_module_config, "+code-object-v3"); } void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h index 8d2ef53bfa9..e60f3bc3c14 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h @@ -16,7 +16,15 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ -#include "tensorflow/compiler/xla/service/multi_output_fusion.h" +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/compiler/xla/statusor.h" namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0196267d904..eefa4661d37 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -385,6 +385,19 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( } else { if (maybe_cubin.status().code() == tensorflow::error::Code::NOT_FOUND) { + if (!hlo_module_config.debug_options() + .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found()) { + PrintCantFindCudaMessage( + "Can't find ptxas binary in ${CUDA_DIR}/bin. Custom ptxas " + "location can be specified using $PATH.", + hlo_module_config); + LOG(FATAL) + << "Can't find ptxas binary. You can pass the flag " + "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found " + "to use the GPU driver for compiling ptx instead. However " + "this option is discouraged and can lead to increased " + "memory consumption and other subtle runtime issues."; + } // Missing ptxas is expected in some environments where CUDA SDK // binaries are not available. We don't want to spam logs with // identical warnings in this case. @@ -402,25 +415,13 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( "using $PATH.", hlo_module_config); } - CHECK(hlo_module_config.debug_options() - .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found()) - << "There was an error when trying to compile ptx into sass " - "code. If you want to try falling back to the GPU driver to " - "jit compile ptx, you can use the flag " - "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found." - " Use at your own risk though, it has known drawbacks like " - "increased memory consumption."; } else { - LOG(ERROR) << "Error during compilation of ptx to sass: " - << maybe_cubin.status(); - CHECK(hlo_module_config.debug_options() - .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error()) - << "There was an error when trying to compile ptx into sass " - "code. If you want to try falling back to the GPU driver to " - "jit compile ptx, you can use the flag " - "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_error." 
- " Use at your own risk though, it has known drawbacks like " - "increased memory consumption."; + LOG(FATAL) << "ptxas returned an error during compilation of ptx " + "to sass: '" + << maybe_cubin.status() << "' " + << "If the error message indicates that a file could " + "not be written, please verify that sufficient " + "filesystem space is provided."; } // We're going to use the driver to JIT our PTX->SASS, so warn if diff --git a/tensorflow/compiler/xla/service/gpu/target_util.cc b/tensorflow/compiler/xla/service/gpu/target_util.cc index 49eadd8c6be..31b590a19ff 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.cc +++ b/tensorflow/compiler/xla/service/gpu/target_util.cc @@ -111,47 +111,50 @@ struct TargetDeviceFunction { struct TargetDeviceFunction GetDeviceFunctionRoot( TargetDeviceFunctionID func_id) { switch (func_id) { - case TargetDeviceFunctionID::kPow: { - return {"__nv_pow", "__ocml_pow"}; - } - case TargetDeviceFunctionID::kErfcinv: { - return {"__nv_erfcinv", "__ocml_erfcinv"}; - } - case TargetDeviceFunctionID::kLog: { - return {"__nv_log", "__ocml_log"}; - } - case TargetDeviceFunctionID::kLog1p: { - return {"__nv_log1p", "__ocml_log1p"}; - } - case TargetDeviceFunctionID::kSin: { - return {"__nv_sin", "__ocml_sin"}; + case TargetDeviceFunctionID::kAtan2: { + return {"__nv_atan2", "__ocml_atan2"}; } case TargetDeviceFunctionID::kCos: { return {"__nv_cos", "__ocml_cos"}; } + case TargetDeviceFunctionID::kErfcinv: { + return {"__nv_erfcinv", "__ocml_erfcinv"}; + } case TargetDeviceFunctionID::kExp: { return {"__nv_exp", "__ocml_exp"}; } case TargetDeviceFunctionID::kExpm1: { return {"__nv_expm1", "__ocml_expm1"}; } - case TargetDeviceFunctionID::kSqrt: { - return {"__nv_sqrt", "__ocml_sqrt"}; - } - case TargetDeviceFunctionID::kRsqrt: { - return {"__nv_rsqrt", "__ocml_rsqrt"}; - } - case TargetDeviceFunctionID::kAtan2: { - return {"__nv_atan2", "__ocml_atan2"}; - } case TargetDeviceFunctionID::kFmod: { return {"__nv_fmod", "__ocml_fmod"}; } + case TargetDeviceFunctionID::kHypot: { + return {"__nv_hypot", "__ocml_hypot"}; + } + case TargetDeviceFunctionID::kLog: { + return {"__nv_log", "__ocml_log"}; + } + case TargetDeviceFunctionID::kLog1p: { + return {"__nv_log1p", "__ocml_log1p"}; + } + case TargetDeviceFunctionID::kPow: { + return {"__nv_pow", "__ocml_pow"}; + } case TargetDeviceFunctionID::kRound: { return {"__nv_round", "__ocml_round"}; } - case TargetDeviceFunctionID::kHypot: { - return {"__nv_hypot", "__ocml_hypot"}; + case TargetDeviceFunctionID::kRsqrt: { + return {"__nv_rsqrt", "__ocml_rsqrt"}; + } + case TargetDeviceFunctionID::kSin: { + return {"__nv_sin", "__ocml_sin"}; + } + case TargetDeviceFunctionID::kSqrt: { + return {"__nv_sqrt", "__ocml_sqrt"}; + } + case TargetDeviceFunctionID::kTanh: { + return {"__nv_tanh", "__ocml_tanh"}; } } } diff --git a/tensorflow/compiler/xla/service/gpu/target_util.h b/tensorflow/compiler/xla/service/gpu/target_util.h index 4355ed21136..2bdaea7734a 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.h +++ b/tensorflow/compiler/xla/service/gpu/target_util.h @@ -46,20 +46,21 @@ enum class TargetIntrinsicID { // Enumeration to get target specific device math function. 
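// How a TargetDeviceFunctionID such as the newly added kTanh becomes a concrete
// callee: the root names below ("__nv_tanh" / "__ocml_tanh") are combined with
// the target and the operand precision. The suffix rule in this sketch ("f" for
// F32 on NVPTX, "_f32"/"_f64" on AMDGPU) is an assumption made for illustration
// only; the authoritative logic is ObtainDeviceFunctionName in target_util.cc,
// which is not part of this hunk.
#include <stdexcept>
#include <string>

enum class ToyTriple { kNvptx, kAmdgpu };

std::string DeviceFunctionName(const std::string& nvptx_root,
                               const std::string& amdgpu_root, ToyTriple triple,
                               bool is_f32) {
  switch (triple) {
    case ToyTriple::kNvptx:
      return is_f32 ? nvptx_root + "f" : nvptx_root;
    case ToyTriple::kAmdgpu:
      return amdgpu_root + (is_f32 ? "_f32" : "_f64");
  }
  throw std::logic_error("unknown triple");
}

int main() {
  // kTanh at F64 on NVPTX resolves to "__nv_tanh", matching the new F64
  // fallback added to GpuElementalIrEmitter::EmitTanh earlier in this change.
  return DeviceFunctionName("__nv_tanh", "__ocml_tanh", ToyTriple::kNvptx,
                            /*is_f32=*/false) == "__nv_tanh"
             ? 0
             : 1;
}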
enum class TargetDeviceFunctionID { - kPow = 0, - kErfcinv, - kLog, - kLog1p, - kSin, + kAtan2 = 0, kCos, + kErfcinv, kExp, kExpm1, - kSqrt, - kRsqrt, - kAtan2, kFmod, + kHypot, + kLog, + kLog1p, + kPow, kRound, - kHypot + kRsqrt, + kSin, + kSqrt, + kTanh, }; // Emits IR to call a device function named "callee_name" on the given diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index e04dba418d9..7a9845d0f49 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -235,6 +235,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gpu_copy_alone_test", + srcs = [ + "gpu_copy_alone_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_ftz_test", srcs = ["gpu_ftz_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc new file mode 100644 index 00000000000..1c475ab4e10 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +namespace xla { +namespace gpu { + +namespace { + +// WARNING: This test must be alone in its file! Otherwise, the +// error isn't caught. We expect a CUDA_ERROR_ILLEGAL_ADDRESS to be +// thrown with the old buggy code. +class CopyAloneNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contains a MOF fusion and the XLA optimizer pass + // doesn't like this. 
+ debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(CopyAloneNoOptTest, CopyTranspose) { + const char* hlo_text = R"( +HloModule mod +ENTRY main { + %param = f32[8,32,32,32,16]{4,3,2,1,0} parameter(0) + ROOT %copy = f32[8,32,32,32,16]{3,2,1,4,0} copy(f32[8,32,32,32,16]{4,3,2,1,0} %param) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2 +)"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 94a4df43cf4..8a31bc5fef4 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -707,6 +707,10 @@ Status HloCostAnalysis::HandleCholesky(const HloInstruction* hlo) { return Status::OK(); } +Status HloCostAnalysis::HandleAllGather(const HloInstruction* hlo) { + return Status::OK(); +} + Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) { // We assume 2 replicas, so that each output element is the sum of two input // elements. @@ -732,6 +736,16 @@ Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) { return Status::OK(); } +Status HloCostAnalysis::HandleCollectivePermuteStart( + const HloInstruction* /*hlo*/) { + return Status::OK(); +} + +Status HloCostAnalysis::HandleCollectivePermuteDone( + const HloInstruction* /*hlo*/) { + return Status::OK(); +} + Status HloCostAnalysis::HandlePartitionId(const HloInstruction* /*hlo*/) { return Status::OK(); } @@ -1027,6 +1041,42 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_); } +int64 HloCostAnalysis::GetBytesRead(const HloInstruction& hlo, + absl::optional memory_space) const { + int64 bytes_read = 0; + for (int operand_number = 0; operand_number < hlo.operand_count(); + ++operand_number) { + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(hlo.operand(operand_number)->shape())) { + absl::optional index_memory_space; + if (indexed_shape.shape.has_layout()) { + index_memory_space = indexed_shape.shape.layout().memory_space(); + } + if (!memory_space || memory_space == index_memory_space) { + bytes_read += + operand_bytes_accessed(hlo, operand_number, indexed_shape.index); + } + } + } + return bytes_read; +} + +int64 HloCostAnalysis::GetBytesWritten( + const HloInstruction& hlo, absl::optional memory_space) const { + int64 bytes_written = 0; + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(hlo.shape())) { + absl::optional index_memory_space; + if (indexed_shape.shape.has_layout()) { + index_memory_space = indexed_shape.shape.layout().memory_space(); + } + if (!memory_space || memory_space == index_memory_space) { + bytes_written += output_bytes_accessed(hlo, indexed_shape.index); + } + } + return bytes_written; +} + StatusOr HloCostAnalysis::ProcessSubcomputation( HloComputation* computation) { auto visitor = CreateNestedCostAnalysis(shape_size_, per_second_rates_); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 915c4dcbe84..d9085dd7785 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ 
b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -76,9 +76,12 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleFft(const HloInstruction* fft) override; Status HandleTriangularSolve(const HloInstruction* hlo) override; Status HandleCholesky(const HloInstruction* hlo) override; + Status HandleAllGather(const HloInstruction* hlo) override; Status HandleAllReduce(const HloInstruction* crs) override; Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleCollectivePermute(const HloInstruction* hlo) override; + Status HandleCollectivePermuteStart(const HloInstruction* hlo) override; + Status HandleCollectivePermuteDone(const HloInstruction* hlo) override; Status HandleReplicaId(const HloInstruction* hlo) override; Status HandlePartitionId(const HloInstruction* hlo) override; Status HandleInfeed(const HloInstruction* infeed) override; @@ -161,6 +164,14 @@ class HloCostAnalysis : public ConstDfsHloVisitor { ShapeIndex index = {}) const; float optimal_seconds(const HloInstruction& hlo) const; + // Get bytes read/written by this HLO. If memory_space is provided, it returns + // the bytes read/written from/to the given memory space only. + int64 GetBytesRead(const HloInstruction& hlo, + absl::optional memory_space = absl::nullopt) const; + int64 GetBytesWritten( + const HloInstruction& hlo, + absl::optional memory_space = absl::nullopt) const; + const Properties& properties() const { return properties_sum_; } const float property(const string& key) const { return GetProperty(key, properties()); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 4894e566393..d0d533e0b06 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -676,6 +676,39 @@ bool HloDataflowAnalysis::UpdateWhileValueSet(HloInstruction* xla_while) { } } +bool HloDataflowAnalysis::UpdateCollectivePermuteStartValueSet( + HloInstruction* collective_permute_start) { + CHECK_EQ(collective_permute_start->opcode(), + HloOpcode::kCollectivePermuteStart); + bool changed = false; + // CollectivePermuteStart forwards the operand value to element {0} of its + // output. + const HloValueSet& operand_value_set = + GetValueSet(collective_permute_start->operand(0)); + HloValueSet& value_set = GetValueSet(collective_permute_start, {0}); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + return changed; +} + +bool HloDataflowAnalysis::UpdateCollectivePermuteDoneValueSet( + HloInstruction* collective_permute_done) { + CHECK_EQ(collective_permute_done->opcode(), + HloOpcode::kCollectivePermuteDone); + bool changed = false; + // CollectivePermuteDone forwards the operand value at {0} to its output. + const HloValueSet& operand_value_set = + GetValueSet(collective_permute_done->operand(0), {1}); + HloValueSet& value_set = GetValueSet(collective_permute_done); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + return changed; +} + bool HloDataflowAnalysis::UpdateInstructionValueSet( HloInstruction* instruction) { // Recompute from operands. 
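// The two Update* functions added above thread HloValue sets through the async
// pair: collective-permute-start forwards its operand's values into element {0}
// of its tuple-shaped result, and collective-permute-done is fed from element
// {1} of the start op. A toy model of that forwarding using string value-sets
// keyed by (instruction name, tuple index); names and the flat map are purely
// illustrative, not the HloDataflowAnalysis data structures.
#include <map>
#include <set>
#include <string>
#include <utility>

using ToyValueSet = std::set<std::string>;
using ToyKey = std::pair<std::string, int>;  // {instruction, tuple index}

int main() {
  std::map<ToyKey, ToyValueSet> values;
  values[{"operand", -1}] = {"v_operand"};  // -1: value of a non-tuple result

  // Start: the operand's values reappear at element {0} of the start's tuple.
  values[{"cp-start", 0}] = values[{"operand", -1}];

  // Element {1} (the destination buffer) is defined by the start op itself...
  values[{"cp-start", 1}] = {"v_cp_start_dest"};
  // ...and Done forwards exactly that element to its own result.
  values[{"cp-done", -1}] = values[{"cp-start", 1}];

  return values[{"cp-done", -1}].count("v_cp_start_dest") == 1 ? 0 : 1;
}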
@@ -712,6 +745,10 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( return UpdateCopyDoneValueSet(instruction); case HloOpcode::kConditional: return UpdateConditionalValueSet(instruction); + case HloOpcode::kCollectivePermuteStart: + return UpdateCollectivePermuteStartValueSet(instruction); + case HloOpcode::kCollectivePermuteDone: + return UpdateCollectivePermuteDoneValueSet(instruction); default: // Instruction does not forward HloValues (it defines all values in its // output). No update is necessary. diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 75bcf7ea318..bec592aeb20 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -216,6 +216,10 @@ class HloDataflowAnalysis { bool UpdateTupleValueSet(HloInstruction* tuple); bool UpdateWhileValueSet(HloInstruction* xla_while); bool UpdateAddDependencyValueSet(HloInstruction* add_dependency); + bool UpdateCollectivePermuteStartValueSet( + HloInstruction* collective_permute_start); + bool UpdateCollectivePermuteDoneValueSet( + HloInstruction* collective_permute_done); // Propagates the dataflow through the module. In particular, it propagates // the HloValueSet from its defining instruction to the users of the diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index b8e3f83b515..900b557b4dc 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -47,16 +47,14 @@ StatusOr HloDCE::RunOnComputation( // computation's instruction while simultaneously removing instructions. std::vector dead_roots; for (auto* instruction : computation->instructions()) { + auto maybe_collective_op = DynCast(instruction); if (instruction != computation->root_instruction() && instruction->user_count() == 0 && computation->IsSafelyRemovable(instruction) && (!instruction->HasSideEffect() || (remove_cross_partition_collective_ops && - ((instruction->opcode() == HloOpcode::kAllReduce && - !Cast(instruction)->constrain_layout()) || - (instruction->opcode() == HloOpcode::kAllToAll && - !Cast(instruction)->constrain_layout()) || - instruction->opcode() == HloOpcode::kCollectivePermute)))) { + (maybe_collective_op != nullptr && + !maybe_collective_op->constrain_layout())))) { dead_roots.push_back(instruction); } } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 106ebb7be0e..02443ff3c3c 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -2556,6 +2556,20 @@ std::unique_ptr> HloEvaluator::MatmulArray2D( lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64); } +std::unique_ptr>> HloEvaluator::MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs) { + return MatmulArray2DImpl>( + lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulC64); +} + +std::unique_ptr>> HloEvaluator::MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs) { + return MatmulArray2DImpl>( + lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulC128); +} + std::unique_ptr> HloEvaluator::MatmulArray2D( const Array2D& lhs, const Array2D& rhs) { return MatmulArray2DImpl( diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index 803004225d2..dcd4129adcd 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ 
b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -164,6 +164,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault { const Array2D& lhs, const Array2D& rhs); static std::unique_ptr> MatmulArray2D( const Array2D& lhs, const Array2D& rhs); + static std::unique_ptr>> MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs); + static std::unique_ptr>> MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs); static std::unique_ptr> MatmulArray2D( const Array2D& lhs, const Array2D& rhs); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 47a455ac3f4..ad21efa13c9 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -312,12 +312,13 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, absl::string_view label, - const DebugOptions& debug_options, bool show_backend_config, + const DebugOptions& debug_options, + HloRenderOptions hlo_render_options, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), label_(label), debug_options_(debug_options), - show_backend_config_(show_backend_config), + hlo_render_options_(hlo_render_options), profile_(profile), filter_(std::move(filter)) {} @@ -384,7 +385,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const bool show_backend_config_; + const HloRenderOptions hlo_render_options_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -565,7 +566,8 @@ bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { if (subcomp->IsFusionComputation()) { const HloInstruction* fusion = subcomp->FusionInstruction(); - if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) { + if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion) || + !hlo_render_options_.show_fusion_subcomputations) { return false; } } @@ -1057,9 +1059,12 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGetDimensionSize: case HloOpcode::kSetDimensionSize: return kGray; + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kPartitionId: @@ -1130,7 +1135,8 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { + if (!hlo_render_options_.show_backend_config || + instr->raw_backend_config_string().empty()) { return ""; } @@ -1601,14 +1607,14 @@ StatusOr RenderGraph(const HloComputation& computation, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return Unavailable("Can't render as URL; no URL renderer was registered."); } string rendered_dot = - 
HloDotDumper(&computation, label, debug_options, show_backend_config, + HloDotDumper(&computation, label, debug_options, hlo_render_options, hlo_execution_profile, NodeFilter()) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1616,7 +1622,7 @@ StatusOr RenderGraph(const HloComputation& computation, StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config, + HloRenderOptions hlo_render_options, const absl::flat_hash_set& boundary) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { @@ -1629,7 +1635,7 @@ StatusOr RenderNeighborhoodAround( string rendered_dot = HloDotDumper(node.parent(), label, node.GetModule()->config().debug_options(), - show_backend_config, /*profile=*/nullptr, + hlo_render_options, /*profile=*/nullptr, MakeNodeRadiusAroundFilter(&node, radius, boundary)) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1638,7 +1644,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return FailedPrecondition( @@ -1660,7 +1666,7 @@ StatusOr RenderAllPathsFromTo(const HloInstruction& from, "NODES***

"); } string rendered_dot = - HloDotDumper(from.parent(), label, debug_options, show_backend_config, + HloDotDumper(from.parent(), label, debug_options, hlo_render_options, /*profile=*/nullptr, filter) .Dump(); return WrapDotInFormat(rendered_dot, format); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 324ac67a6dd..528de77e4e6 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -50,6 +50,14 @@ enum class RenderedGraphFormat { kUrl, }; +struct HloRenderOptions { + // Include the backend config string in the rendered graph. + bool show_backend_config = false; + + // Include the fusion subcomputations in the rendered graph. + bool show_fusion_subcomputations = true; +}; + // Renders an HLO module as a human-readable visual graph. // // Note that this only works well for relatively small graphs (no more than a @@ -61,7 +69,7 @@ StatusOr RenderGraph( const HloComputation& computation, absl::string_view label, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile = nullptr, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Like RenderGraph, but renders only nodes "near" the given node in the graph. // @@ -73,7 +81,7 @@ StatusOr RenderGraph( // will be omitted even if they are within the radius. StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config = false, + HloRenderOptions hlo_render_options = {}, const absl::flat_hash_set& boundary = {}); // Renders nodes on any of the paths from `from` to `to`. If there are more @@ -82,7 +90,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Registers a function which implements RenderedGraphFormat::kUrl. 
// diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 27fac19587e..c02100debc3 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -388,6 +388,24 @@ StatusOr> HloInstruction::CreateFromProto( proto.outfeed_config()); break; } + case HloOpcode::kAllGather: { + absl::optional channel_id; + if (proto.channel_id() > 0) { + channel_id = proto.channel_id(); + } + + TF_RET_CHECK(proto.dimensions_size() == 1) + << "AllGather cannot have more than 1 all-gather dimensions"; + TF_RET_CHECK(all_operands().size() == 1) + << "AllGather must have a single operand"; + int64 all_gather_dimension = proto.dimensions(0); + instruction = CreateAllGather( + shape, operands(0), all_gather_dimension, + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end()), + proto.constrain_layout(), channel_id, proto.use_global_device_ids()); + break; + } case HloOpcode::kAllReduce: { TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "AllReduce should have 1 called computation but sees " @@ -434,7 +452,8 @@ StatusOr> HloInstruction::CreateFromProto( /*channel_id=*/channel_id, split_dimension); break; } - case HloOpcode::kCollectivePermute: { + case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: { std::vector> source_target_pairs( proto.source_target_pairs_size()); absl::optional channel_id; @@ -445,8 +464,17 @@ StatusOr> HloInstruction::CreateFromProto( source_target_pairs[i].first = proto.source_target_pairs(i).source(); source_target_pairs[i].second = proto.source_target_pairs(i).target(); } - instruction = CreateCollectivePermute(shape, operands(0), - source_target_pairs, channel_id); + + if (opcode == HloOpcode::kCollectivePermute) { + instruction = CreateCollectivePermute(shape, operands(0), + source_target_pairs, channel_id); + } else if (opcode == HloOpcode::kCollectivePermuteStart) { + instruction = CreateCollectivePermuteStart( + shape, operands(0), source_target_pairs, channel_id); + } else { + LOG(FATAL) << "Expect CollectivePermute or CollectivePermuteStart, " + << "but got " << HloOpcodeString(opcode); + } break; } case HloOpcode::kReplicaId: { @@ -787,6 +815,7 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, case HloOpcode::kRoundNearestAfz: case HloOpcode::kBitcast: case HloOpcode::kCeil: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -929,6 +958,15 @@ HloInstruction::CreateReducePrecision(const Shape& shape, shape, operand, exponent_bits, mantissa_bits); } +/* static */ std::unique_ptr HloInstruction::CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) { + return absl::make_unique( + shape, operand, all_gather_dimension, replica_groups, constrain_layout, + channel_id, use_global_device_ids); +} + /* static */ std::unique_ptr HloInstruction::CreateAllReduce( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, @@ -955,7 +993,18 @@ HloInstruction::CreateCollectivePermute( const std::vector>& source_target_pairs, const absl::optional& channel_id) { return absl::make_unique( - shape, operand, source_target_pairs, channel_id); + HloOpcode::kCollectivePermute, shape, operand, source_target_pairs, + channel_id); +} + +/* static 
*/ std::unique_ptr +HloInstruction::CreateCollectivePermuteStart( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs, + const absl::optional& channel_id) { + return absl::make_unique( + HloOpcode::kCollectivePermuteStart, shape, operand, source_target_pairs, + channel_id); } /* static */ std::unique_ptr HloInstruction::CreateReplicaId() { @@ -1518,9 +1567,11 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kParameter: case HloOpcode::kGetTupleElement: case HloOpcode::kReducePrecision: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kConvolution: @@ -1547,6 +1598,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kBitcast: case HloOpcode::kCeil: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -1900,6 +1952,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kCeil: case HloOpcode::kClamp: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kComplex: case HloOpcode::kConvert: case HloOpcode::kCopy: @@ -1997,9 +2050,11 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kReducePrecision: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kConvolution: case HloOpcode::kCustomCall: case HloOpcode::kReduceWindow: @@ -2851,12 +2906,18 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleConvolution(this); case HloOpcode::kFft: return visitor->HandleFft(this); + case HloOpcode::kAllGather: + return visitor->HandleAllGather(this); case HloOpcode::kAllReduce: return visitor->HandleAllReduce(this); case HloOpcode::kAllToAll: return visitor->HandleAllToAll(this); case HloOpcode::kCollectivePermute: return visitor->HandleCollectivePermute(this); + case HloOpcode::kCollectivePermuteStart: + return visitor->HandleCollectivePermuteStart(this); + case HloOpcode::kCollectivePermuteDone: + return visitor->HandleCollectivePermuteDone(this); case HloOpcode::kReplicaId: return visitor->HandleReplicaId(this); case HloOpcode::kPartitionId: @@ -3934,6 +3995,10 @@ const PaddingConfig& HloInstruction::padding_config() const { return Cast(this)->padding_config(); } +PaddingConfig* HloInstruction::mutable_padding_config() { + return Cast(this)->mutable_padding_config(); +} + int64 HloInstruction::slice_sizes(int64 dimension) const { return Cast(this)->slice_sizes(dimension); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 923138862a7..7a5d506b681 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -618,6 +618,16 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits); + // Creates an all-gather op, which concats the operands of all participants + // along all_gather_dimension. 
The replica_groups, channel_id, and + // use_global_device_ids arguments are identical to those in all-reduce, + // except that the order of the group members determines the concatenation + // order of inputs from different participants. + static std::unique_ptr CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Creates a cross replica reduction op. // // `reduction_computation`: the reduction function. @@ -671,7 +681,7 @@ class HloInstruction { const absl::optional& channel_id, const absl::optional& split_dimension = absl::nullopt); - // Creates a communication instructions that permutes data cross replicas. + // Creates a communication instruction that permutes data cross replicas. // Data is sent/received according to the (source_replica_id, // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a // target_replica_id in any pair, the output on that replica is a tensor @@ -681,6 +691,13 @@ class HloInstruction { const std::vector>& source_target_pairs, const absl::optional& channel_id); + // Creates a communication instruction that initiates the start of + // CollectivePermute. + static std::unique_ptr CreateCollectivePermuteStart( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs, + const absl::optional& channel_id); + // Creates an instruction that returns a U32 replica ID. static std::unique_ptr CreateReplicaId(); @@ -1800,6 +1817,7 @@ class HloInstruction { // Delegates to HloPadInstruction::padding_config. const PaddingConfig& padding_config() const; + PaddingConfig* mutable_padding_config(); // Delegates to HloDynamicSliceInstruction::slice_sizes. 
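A hypothetical builder-side sketch of the new CreateAllGather factory, not part of the patch. The element types of replica_groups and channel_id are stripped in this excerpt; they are assumed to be ReplicaGroup and int64, matching the other collective creation APIs. The operand is assumed to be f32[128,32] with four participants, mirroring the parser test added later in this change.

#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Adds an all-gather that concatenates a f32[128,32] operand from 4
// participants along dimension 1, producing f32[128,128].
xla::HloInstruction* AddAllGather(xla::HloComputation::Builder* builder,
                                  xla::HloInstruction* operand) {
  xla::Shape result_shape = xla::ShapeUtil::MakeShape(xla::F32, {128, 128});
  return builder->AddInstruction(xla::HloInstruction::CreateAllGather(
      result_shape, operand, /*all_gather_dimension=*/1,
      /*replica_groups=*/{}, /*constrain_layout=*/false,
      /*channel_id=*/absl::nullopt, /*use_global_device_ids=*/false));
}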
int64 slice_sizes(int64 dimension) const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index eb821d40e78..9c5a66f0040 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -556,6 +556,51 @@ bool HloCollectiveInstruction::IdenticalSlowPath( }); } +HloAllGatherInstruction::HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) + : HloCollectiveInstruction(HloOpcode::kAllGather, shape, {operand}, + replica_groups, constrain_layout, channel_id), + all_gather_dimension_(all_gather_dimension), + use_global_device_ids_(use_global_device_ids) {} + +std::vector HloAllGatherInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result = + HloCollectiveInstruction::ExtraAttributesToStringImpl(options); + result.push_back(StrCat("dimensions={", all_gather_dimension_, "}")); + if (use_global_device_ids_) { + result.push_back("use_global_device_ids=true"); + } + return result; +} + +std::unique_ptr +HloAllGatherInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique( + shape, new_operands[0], all_gather_dimension(), replica_groups(), + constrain_layout(), channel_id(), use_global_device_ids()); +} + +HloInstructionProto HloAllGatherInstruction::ToProto() const { + HloInstructionProto proto = HloCollectiveInstruction::ToProto(); + proto.add_dimensions(all_gather_dimension_); + return proto; +} + +bool HloAllGatherInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + all_gather_dimension_ == casted_other.all_gather_dimension() && + use_global_device_ids() == casted_other.use_global_device_ids(); +} + HloAllReduceInstruction::HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, @@ -658,10 +703,10 @@ bool HloAllToAllInstruction::IdenticalSlowPath( } HloCollectivePermuteInstruction::HloCollectivePermuteInstruction( - const Shape& shape, HloInstruction* operand, + HloOpcode opcode, const Shape& shape, HloInstruction* operand, const std::vector>& source_target_pairs, const absl::optional& channel_id) - : HloChannelInstruction(HloOpcode::kCollectivePermute, shape, channel_id), + : HloChannelInstruction(opcode, shape, channel_id), source_target_pairs_(source_target_pairs) { AppendOperand(operand); } @@ -693,6 +738,9 @@ bool HloCollectivePermuteInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { + if (opcode() != other.opcode()) { + return false; + } const auto& casted_other = static_cast(other); return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && @@ -707,7 +755,7 @@ HloCollectivePermuteInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* /*context*/) const { return absl::make_unique( - shape, new_operands[0], source_target_pairs(), channel_id()); + opcode(), shape, new_operands[0], source_target_pairs(), channel_id()); } HloReverseInstruction::HloReverseInstruction(const Shape& shape, @@ -1819,8 +1867,14 @@ 
std::unique_ptr HloParameterInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { - return absl::make_unique(parameter_number_, shape, - name()); + auto clone = absl::make_unique(parameter_number_, + shape, name()); + if (parameter_replicated_at_leaf_buffers_ && + ShapeUtil::Equal(shape, this->shape())) { + clone->set_parameter_replicated_at_leaf_buffers( + *parameter_replicated_at_leaf_buffers_); + } + return clone; } HloGetTupleElementInstruction::HloGetTupleElementInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index eecd02d891e..6da01dc088e 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -348,6 +348,38 @@ class HloCollectiveInstruction : public HloChannelInstruction { bool constrain_layout_; }; +class HloAllGatherInstruction : public HloCollectiveInstruction { + public: + explicit HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Same as HloAllReduceInstruction::use_global_device_ids. + bool use_global_device_ids() const { return use_global_device_ids_; } + + // The dimension on which data from different participants are concatenated. + int64 all_gather_dimension() const { return all_gather_dimension_; } + + protected: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + HloInstructionProto ToProto() const override; + + private: + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 all_gather_dimension_; + bool use_global_device_ids_; +}; + class HloAllReduceInstruction : public HloCollectiveInstruction { public: explicit HloAllReduceInstruction( @@ -431,7 +463,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction { class HloCollectivePermuteInstruction : public HloChannelInstruction { public: explicit HloCollectivePermuteInstruction( - const Shape& shape, HloInstruction* operand, + HloOpcode opcode, const Shape& shape, HloInstruction* operand, const std::vector>& source_target_pairs, const absl::optional& channel_id); @@ -674,7 +706,7 @@ class HloMapInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. const std::vector& dimensions() const override { return dimensions_; } int64 dimensions(int64 index) const override { return dimensions()[index]; } - std::vector* mutable_dimensions() { return &dimensions_; } + std::vector* mutable_dimensions() override { return &dimensions_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1377,6 +1409,7 @@ class HloPadInstruction : public HloInstruction { const PaddingConfig& padding_config); // Returns the padding configuration for a pad node. const PaddingConfig& padding_config() const { return padding_config_; } + PaddingConfig* mutable_padding_config() { return &padding_config_; } // Returns the padding value. 
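The new mutable_padding_config accessors (on HloInstruction and HloPadInstruction above) let a pass edit a pad's PaddingConfig in place instead of rebuilding the instruction. A hedged sketch, not part of the patch; `pad` is assumed to be a kPad instruction, and the field names come from the PaddingConfig proto in xla_data.proto.

#include "tensorflow/compiler/xla/service/hlo_instruction.h"

// Grows the high edge padding of dimension 0 by one element, in place.
void WidenPadDim0(xla::HloInstruction* pad) {
  xla::PaddingConfig* config = pad->mutable_padding_config();
  auto* dim0 = config->mutable_dimensions(0);
  dim0->set_edge_padding_high(dim0->edge_padding_high() + 1);
  // The caller is still responsible for updating pad's shape to match.
}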
const HloInstruction* padding_value() const { return operand(1); } HloInstruction* mutable_padding_value() { return mutable_operand(1); } diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index bc1745a0791..5502665e886 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/base/casts.h" #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" #include "absl/strings/numbers.h" @@ -370,6 +371,11 @@ TokKind HloLexer::LexNumberOrPattern() { if (absl::SimpleAtoi(slice, &token_state_.int64_val)) { return TokKind::kInt; } + uint64 uint64_val; + if (absl::SimpleAtoi(slice, &uint64_val)) { + token_state_.int64_val = absl::bit_cast(uint64_val); + return TokKind::kInt; + } LOG(ERROR) << "Failed to parse int literal: " << slice; return TokKind::kError; } diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index ec048bef9e8..cb1b1d0dae4 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -203,6 +203,7 @@ HLO_MATCHER(Abs); HLO_MATCHER(Add); HLO_MATCHER(AddDependency); HLO_MATCHER(AfterAll); +HLO_MATCHER(AllGather); HLO_MATCHER(AllReduce); HLO_MATCHER(AllToAll); HLO_MATCHER(And); diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index de65ed99303..9722d5c2b76 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -420,6 +420,8 @@ StatusOr HloModule::CreateModuleConfigFromShape( if (execution_options->num_partitions() > 0) { module_config.set_num_partitions(execution_options->num_partitions()); } + module_config.set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); if (execution_options->has_device_assignment()) { TF_ASSIGN_OR_RETURN(std::unique_ptr device_assignment, DeviceAssignment::Deserialize( diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index b31a9ae6ca5..964f83322a4 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -128,6 +128,11 @@ class HloModuleConfig { } int64 num_partitions() const { return num_partitions_; } + void set_use_spmd_partitioning(bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + } + bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + // Return a string which unambiguously represents all the fields of this data // structure. Used for generating a cache key for storing the compiled // executable. @@ -199,6 +204,14 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } + const std::vector>>& layout_config() const { + return layout_config_; + } + + std::vector>>* mutable_layout_config() { + return &layout_config_; + } + private: // If you add new members, be sure to update compilation_cache_key. @@ -216,6 +229,10 @@ class HloModuleConfig { // The number of partitions (model parallelism) to compile this binary for. int64 num_partitions_ = 1; + // Whether to use SPMD (true) or MPMD (false) when num_partitions_ > 0 and XLA + // needs to partition the module. + bool use_spmd_partitioning_ = false; + // The target maximum parallelism at which to partition HLOs for parallel // execution on the CPU backend. 
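A minimal sketch, not part of the patch, of the use_spmd_partitioning knob added to HloModuleConfig above; it mirrors what HloModule::CreateModuleConfigFromShape now copies from ExecutionOptions.

// Compile the module as a single SPMD program rather than one program per
// partition (MPMD) when XLA has to partition it across num_partitions()
// devices.
void EnableSpmdPartitioning(xla::HloModuleConfig* config) {
  config->set_use_spmd_partitioning(true);
}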
int64 intra_op_parallelism_threads_ = -1; @@ -232,6 +249,9 @@ class HloModuleConfig { FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; + // TODO(b/155665133): Consolidate fusion, dot, and layout config into a proto + // similar to backend config. + // Custom fusion configuration, where fusion_config_[c][v] control if node v // in computation c must be fused to all its consumers (true) or not (false). std::vector> fusion_config_; @@ -240,6 +260,10 @@ class HloModuleConfig { // how to convert dot operation v (sorted topologically and by computation) to // convolution. std::vector> dot_config_; + + // Layout configuration, where layout_config_[v][i] controls the layout + // decision i of operation v. + std::vector>> layout_config_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 2d66237de59..92359bcbdac 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -48,6 +48,7 @@ namespace xla { V(kAdd, "add", 2) \ V(kAddDependency, "add-dependency", 2) \ V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ + V(kAllGather, "all-gather", 1) \ V(kAllReduce, "all-reduce", kHloOpcodeIsVariadic) \ V(kAllToAll, "all-to-all", kHloOpcodeIsVariadic) \ V(kAtan2, "atan2", 2) \ @@ -62,6 +63,8 @@ namespace xla { V(kCholesky, "cholesky", 1) \ V(kClamp, "clamp", 3) \ V(kCollectivePermute, "collective-permute", 1) \ + V(kCollectivePermuteStart, "collective-permute-start", 1) \ + V(kCollectivePermuteDone, "collective-permute-done", 1) \ V(kClz, "count-leading-zeros", 1) \ V(kCompare, "compare", 2) \ V(kComplex, "complex", 2) \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index a9c3cacc4c4..d52a60d2555 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -765,6 +765,7 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, case HloOpcode::kBitcast: case HloOpcode::kCeil: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -850,6 +851,35 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, HloInstruction::CreateBitcastConvert(shape, operands[0])); break; } + case HloOpcode::kAllGather: { + optional>> tmp_groups; + optional> replica_group_ids; + optional channel_id; + optional> dimensions; + optional constrain_layout; + optional use_global_device_ids; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + attrs["channel_id"] = {/*required=*/false, AttrTy::kInt64, &channel_id}; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, + &constrain_layout}; + attrs["use_global_device_ids"] = {/*required=*/false, AttrTy::kBool, + &use_global_device_ids}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + replica_groups = CreateReplicaGroups(*tmp_groups); + } + instruction = builder->AddInstruction(HloInstruction::CreateAllGather( + shape, operands[0], dimensions->at(0), replica_groups, + constrain_layout ? *constrain_layout : false, channel_id, + use_global_device_ids ? 
*use_global_device_ids : false)); + break; + } case HloOpcode::kAllReduce: { optional>> tmp_groups; optional to_apply; @@ -909,7 +939,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, split_dimension)); break; } - case HloOpcode::kCollectivePermute: { + case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: { optional>> source_targets; attrs["source_target_pairs"] = { /*required=*/true, AttrTy::kBracedInt64ListList, &source_targets}; @@ -928,9 +959,19 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, pairs[i].first = (*source_targets)[i][0]; pairs[i].second = (*source_targets)[i][1]; } - instruction = - builder->AddInstruction(HloInstruction::CreateCollectivePermute( - shape, operands[0], pairs, channel_id)); + if (opcode == HloOpcode::kCollectivePermute) { + instruction = + builder->AddInstruction(HloInstruction::CreateCollectivePermute( + shape, operands[0], pairs, channel_id)); + } else if (opcode == HloOpcode::kCollectivePermuteStart) { + instruction = builder->AddInstruction( + HloInstruction::CreateCollectivePermuteStart(shape, operands[0], + pairs, channel_id)); + } else { + LOG(FATAL) << "Expect opcode to be CollectivePermute or " + "CollectivePermuteStart, but got " + << HloOpcodeString(opcode); + } break; } case HloOpcode::kReplicaId: { @@ -2569,14 +2610,10 @@ bool HloParserImpl::CheckParsedValueIsInRange(LocTy loc, ParsedElemT value) { std::is_same::value)) << "Unimplemented checking for ParsedElemT"; - ParsedElemT upper_bound; - if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) { - upper_bound = std::numeric_limits::max(); - } else { - upper_bound = - static_cast(std::numeric_limits::max()); - } - if (value > upper_bound || value < 0) { + const uint64 unsigned_value = value; + const uint64 upper_bound = + static_cast(std::numeric_limits::max()); + if (unsigned_value > upper_bound) { // Value is out of range for LiteralNativeT. 
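The hlo_lexer and CheckParsedValueIsInRange changes above make unsigned 64-bit literals round-trip: decimals that overflow int64 are parsed as uint64 and stored bit-cast into the int64 token value. A standalone illustration of that round trip, not part of the patch; it only needs Abseil:

#include <cstdint>
#include <iostream>

#include "absl/base/casts.h"
#include "absl/strings/numbers.h"

int main() {
  // 2^63 overflows int64 but fits in uint64, so the lexer falls back to an
  // unsigned parse and stores the bits in the int64 token field.
  uint64_t as_unsigned;
  if (!absl::SimpleAtoi("9223372036854775808", &as_unsigned)) return 1;
  int64_t stored = absl::bit_cast<int64_t>(as_unsigned);  // -9223372036854775808
  // Reinterpreting the stored bits as the literal's u64 type recovers the
  // original value, which is why the updated hlo_parser tests expect success.
  std::cout << stored << " -> " << absl::bit_cast<uint64_t>(stored) << "\n";
  // u64[] constant(-1) parses the same way: -1 reinterprets to 2^64 - 1.
  std::cout << absl::bit_cast<uint64_t>(int64_t{-1}) << "\n";
  return 0;
}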
return Error(loc, StrCat("value ", value, " is out of range for literal's primitive type ", diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 7e66b4e648d..a687d0e1921 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1480,6 +1480,43 @@ ENTRY CRS { )" }, +// all-gather +{ +"AllGather", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, dimensions={1} +} + +)" +}, +// all-gather with constrained layout +{ +"AllGatherWithLayout", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, constrain_layout=true, dimensions={1} +} + +)" +}, +// all-gather with subgroups +{ +"AllGatherWithSubgroups", +R"(HloModule AllGatherWithSubgroups + +ENTRY AllGatherWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,64]{0,1} all-gather(input), replica_groups={{0,1},{2,3}}, dimensions={1} +} + +)", +/*replica_count=*/4, +}, // all-to-all { "AllToAll", @@ -1516,6 +1553,20 @@ ENTRY CollectivePermute { ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}} } +)", +/*replica_count=*/4 +}, +// collective-permute-start and -done +{ +"CollectivePermuteStartAndDone", +R"(HloModule CollectivePermuteStartAndDone + +ENTRY CollectivePermuteStartAndDone { + input = f32[128,32]{0,1} parameter(0) + collective-permute-start.1 = (f32[128,32]{0,1}, f32[128,32]{0,1}, u32[], u32[]) collective-permute-start(input), source_target_pairs={{0,1},{1,2},{2,3}} + ROOT collective-permute-done.1 = f32[128,32]{0,1} collective-permute-done(collective-permute-start.1) +} + )", /*replica_count=*/4 }, @@ -1963,9 +2014,7 @@ TEST_F(HloParserTest, ConstantUnsignedUnderflow) { ROOT %constant = u64[] constant(-1) })"; auto result = ParseAndReturnUnverifiedModule(original); - EXPECT_NE(Status::OK(), result.status()); - ExpectHasSubstr(result.status().error_message(), - "is out of range for literal's primitive type U64"); + EXPECT_EQ(Status::OK(), result.status()); } TEST_F(HloParserTest, ConstantUnsignedOverflow) { @@ -1987,7 +2036,7 @@ TEST_F(HloParserTest, ConstantUnsignedInt64Overflow) { ROOT %constant = u64[] constant(9223372036854775808) })"; auto result = ParseAndReturnUnverifiedModule(original); - EXPECT_NE(Status::OK(), result.status()); + EXPECT_EQ(Status::OK(), result.status()); } TEST_F(HloParserTest, ConstantC64Overflow) { diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc new file mode 100644 index 00000000000..7fc05608800 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -0,0 +1,592 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/array.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace hlo_sharding_util { + +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count) { + int64 device = 0; + int64 count = 0; + for (auto& it : device_map) { + if (it.second > count) { + count = it.second; + device = it.first; + } + } + if (top_count != nullptr) { + *top_count = count; + } + return count > 0 ? absl::optional(device) : absl::optional(); +} + +Status AssignComputationDevice(HloComputation* computation, int64 device) { + VLOG(4) << "Assigning device " << device << " to " << computation->name() + << " computation"; + for (HloInstruction* instruction : computation->instructions()) { + if (!instruction->has_sharding()) { + VLOG(4) << "Assigning device " << device << " to " << instruction->name(); + instruction->set_device_sharding(device); + } + } + return Status::OK(); +} + +absl::optional GetMostOccurringDevice( + absl::Span instructions) { + std::map device_map; + for (HloInstruction* instruction : instructions) { + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(nullptr)) { + // The UsedDevices() API returns a map. + device_map[it.first] += it.second; + } + } + } + return SelectDominantDevice(device_map, nullptr); +} + +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor) { + int64 instruction_count = 0; + std::map device_map; + for (HloComputation* computation : computations) { + for (HloInstruction* instruction : computation->instructions()) { + int64 count = 1; + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(&count)) { + // The UsedDevices() API returns a map. 
+ device_map[it.first] += it.second; + } + } + instruction_count += count; + } + } + int64 count; + absl::optional device = SelectDominantDevice(device_map, &count); + absl::optional dominant_device; + if (device) { + double factor = + static_cast(count) / static_cast(instruction_count); + if (factor >= dominant_factor) { + dominant_device = device; + } + } + return dominant_device; +} + +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions) { + if (sharding.IsTileMaximal()) { + return sharding; + } + const int64 rank = dimensions.size(); + std::vector tile_assignment_dim(rank); + for (int64 i = 0; i < rank; ++i) { + tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + } + Array tile_assignment = sharding.tile_assignment(); + tile_assignment.Reshape(tile_assignment_dim); + tile_assignment.Each([&](absl::Span indices, int64* value) { + std::vector src_indices(indices.size(), -1); + for (int64 i = 0; i < indices.size(); ++i) { + src_indices[dimensions[i]] = indices[i]; + } + *value = sharding.tile_assignment()(src_indices); + }); + return HloSharding::Tile(tile_assignment); +} + +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return sharding; + } + + // In case of a tiled sharding the reshaped sharding will be a valid if the + // reshape is composed from the following operations: + // * Adding or removing dimensions with size 1. + // * Merging consecutive dimensions where only the most major is sharded. + // * Splitting a dimension to consecutive dimensions. + // * Any reshaping of unsharded dimensions. + // Note that merge and split can happen consecutively on the same dimension, + // e.g., f32[1024,256,1024] to f32[128,2048,1024] can be considered that 1024 + // gets split into 128 and 8, but 8 then gets merged with 256. We use stacks + // to make supporting such cases easy. + const Shape tile_shape = sharding.TileShape(source_shape); + std::vector target_tile_assignment_dimensions; + std::vector source_dims_stack(source_shape.rank()); + std::vector target_dims_stack(target_shape.rank()); + std::vector sharding_tile_dims_stack(source_shape.rank()); + for (int64 i = 0; i < source_shape.rank(); ++i) { + source_dims_stack[i] = source_shape.dimensions(source_shape.rank() - 1 - i); + sharding_tile_dims_stack[i] = + sharding.tile_assignment().dim(source_shape.rank() - 1 - i); + } + for (int64 i = 0; i < target_shape.rank(); ++i) { + target_dims_stack[i] = target_shape.dimensions(target_shape.rank() - 1 - i); + } + while (!source_dims_stack.empty() || !target_dims_stack.empty()) { + if (target_dims_stack.empty()) { + if (Product(sharding_tile_dims_stack) != 1) { + return absl::nullopt; + } + break; + } + int64 s_size = 1; + int64 t_size = 1; + int64 s_partitions = 1; + if (!source_dims_stack.empty()) { + s_size = source_dims_stack.back(); + source_dims_stack.pop_back(); + s_partitions = sharding_tile_dims_stack.back(); + sharding_tile_dims_stack.pop_back(); + } + t_size = target_dims_stack.back(); + target_dims_stack.pop_back(); + if (s_partitions * Product(sharding_tile_dims_stack) == 1) { + // No more partitions left. + target_tile_assignment_dimensions.push_back(1); + continue; + } + if (s_size == t_size) { + // Same dimension. + target_tile_assignment_dimensions.push_back(s_partitions); + } else if (t_size == 1) { + // Trivial dimension added. 
+ target_tile_assignment_dimensions.push_back(1); + source_dims_stack.push_back(s_size); + sharding_tile_dims_stack.push_back(s_partitions); + } else if (s_size == 1) { + // Trivial dimension removed. + if (s_partitions != 1) { + return absl::nullopt; + } + target_dims_stack.push_back(t_size); + } else if (s_size > t_size) { + // Dimension split. + if (s_size % t_size != 0 || t_size % s_partitions != 0) { + return absl::nullopt; + } + target_tile_assignment_dimensions.push_back(s_partitions); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(1); + } else { + // Dimension merge. Also merge the source dimension with the next, and + // process it next time. + if (s_size % s_partitions != 0) { + return absl::nullopt; + } + CHECK(!source_dims_stack.empty()); + if (sharding_tile_dims_stack.back() != 1 && s_size != s_partitions) { + // If the next dimension to combine is sharded, we require that the + // current dimension's shard size to be 1. Otherwise, the new shard + // would be non-contiguous. + return absl::nullopt; + } + source_dims_stack.back() *= s_size; + sharding_tile_dims_stack.back() *= s_partitions; + target_dims_stack.push_back(t_size); + } + } + Array new_tile_assignment = sharding.tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ReverseSharding(const HloSharding& sharding, + absl::Span dimensions) { + if (sharding.IsTileMaximal() || dimensions.empty()) { + return sharding; + } + + Array new_tile_assignment(sharding.tile_assignment().dimensions()); + new_tile_assignment.Each([&](absl::Span indices, int64* device) { + std::vector original_indices(indices.begin(), indices.end()); + for (int64 d : dimensions) { + original_indices[d] = + new_tile_assignment.dim(d) - 1 - original_indices[d]; + } + *device = sharding.tile_assignment()(original_indices); + }); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims) { + CHECK(!sharding.IsTuple() && !sharding.IsTileMaximal()); + CHECK_NE(absl::c_find(dims, dim), dims.end()) << "dim is not in dims"; + // We optimize the tile assignment on the single dimension dim in a way to + // minimize communication among devices caused by the reshard: + // +---+---+ +---+---+ +-+-+-+-+ + // | | | | 0 | | | | | | + // | 0 | 1 | +-------+ | | | | | + // | | | reshape on | 1 | reshape on | | | | | + // +---+---+ dim 0 => +-------+ dim 1 => |0|2|1|3| + // | | | | 2 | | | | | | + // | 2 | 3 | +-------+ | | | | | + // | | | | 3 | | | | | | + // +---+---+ +---+---+ +-+-+-+-+ + + std::vector tile_dims(sharding.tile_assignment().num_dimensions(), 1); + // Handle ignore dimensions. 
+ std::vector ignore_sizes; + int64 ignore_size = 1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + int64 size = sharding.tile_assignment().dim(i); + ignore_sizes.push_back(size); + tile_dims[i] = size; + ignore_size *= size; + } + } + + using Buckets = std::vector>; + Array buckets(ignore_sizes, + Buckets(sharding.tile_assignment().dim(dim))); + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + std::vector ignore_index; + for (int64 i = 0; i < index.size(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + ignore_index.push_back(index[i]); + } + } + buckets(ignore_index)[index[dim]].push_back(device); + }); + std::vector devices; + buckets.Each([&](absl::Span index, const Buckets& buckets) { + for (auto& bucket : buckets) { + devices.insert(devices.end(), bucket.begin(), bucket.end()); + } + }); + tile_dims[dim] = devices.size() / ignore_size; + Array tile_assignment(tile_dims); + tile_assignment.SetValues(devices); + return HloSharding::Tile(tile_assignment); +} + +bool ContainsTileSharding(const HloModule& module) { + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->has_sharding() && + !instruction->sharding().IsTileMaximal()) { + return true; + } + } + } + return false; +} + +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector output_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.offset_dims(), i)) { + output_tile_assignment_dims.push_back(1); + } else { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(output_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo) { + if (output_sharding.IsTileMaximal()) { + return output_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dim(i)); + } + } + Array new_tile_assignment = output_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { + if (hlo.sharding().IsTileMaximal()) { + return hlo.sharding(); + } + + const GatherDimensionNumbers& dnums = hlo.gather_dimension_numbers(); + std::vector tile_assignment_dims(hlo.shape().rank()); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + tile_assignment_dims[i] = hlo.sharding().tile_assignment().dim(i); + num_elements *= hlo.sharding().tile_assignment().dim(i); + } else { + tile_assignment_dims[i] = 1; + } + } + if (num_elements == hlo.sharding().tile_assignment().num_elements()) { + // Output sharding is only on non offset dimensions. 
We use output sharding + // to shard this gather op directly. + return hlo.sharding(); + } + + if (num_elements == 1) { + // Output sharding is only on offset dimensions. We do not shard this gather + // op. Return a tile maximal sharding with the first device in output + // sharding tile assignment. + return HloSharding::AssignDevice(*hlo.sharding().tile_assignment().begin()); + } + + // Output sharding is on both offset and non offset dimensions. We shard the + // gather op only on non offset dimensions. + // For example: + // - the gather op has sharding [2,2]{0,1,2,3}, + // - first dimension is non offset dimension, + // - second dimension is offset dimension, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(hlo.shape().rank(), 0LL), + slice_limits(hlo.shape().rank()); + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + slice_limits[i] = hlo.sharding().tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + hlo.sharding().tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.update_window_dims(), i)) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dim(i)); + } + } + if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { + index_tile_assignment_dims.push_back(1); + } + Array new_tile_assignment = data_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector data_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.update_window_dims(), i)) { + data_tile_assignment_dims.push_back(1); + } else { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(data_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + // Only shard on first "number of scatter_window_dims" dimensions. + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + int64 num_elements = 1; + int64 index_dim = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + num_elements *= index_sharding.tile_assignment().dim(index_dim); + index_dim++; + } + } + if (num_elements == index_sharding.tile_assignment().num_elements()) { + // Index sharding is only on scatter_window_dims. We use this index sharding + // directly. + return index_sharding; + } + + // Index sharding is only on update_window_dims. 
We do not shard this scatter + // op. Return a tile maximal sharding with the first device in index sharding + // tile assignment. + if (num_elements == 1) { + return HloSharding::AssignDevice(*index_sharding.tile_assignment().begin()); + } + + const int64 index_rank = hlo.operand(1)->shape().rank(); + std::vector slice_starts(index_rank, 0LL), slice_limits(index_rank); + for (int64 i = 0; i < index_rank; ++i) { + if (i < index_dim) { + slice_limits[i] = index_sharding.tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + index_sharding.tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + const int64 data_rank = hlo.operand(2)->shape().rank(); + std::vector tile_assignment_dims(data_rank, 1LL); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + CHECK_LT(i, data_rank); + tile_assignment_dims[i] = data_sharding.tile_assignment().dim(i); + num_elements *= data_sharding.tile_assignment().dim(i); + } + } + if (num_elements == data_sharding.tile_assignment().num_elements()) { + // Data sharding is only on scatter_window_dims. We use this data sharding + // directly. + return data_sharding; + } + + if (num_elements == 1) { + // Data sharding is only on update_window_dims. We do not shard this + // scatter op. Return a tile maximal sharding with the first device in + // data sharding tile assignment. + return HloSharding::AssignDevice(*data_sharding.tile_assignment().begin()); + } + + // Data sharding is on both update_window_dims and scatter_window_dims. We + // shard the scatter op only on scatter_window_dims. For example: + // - the scatter data has sharding [2,2]{0,1,2,3}, + // - first dimension is scatter_window_dims, + // - second dimension is update_window_dims, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(data_rank, 0LL); + Array tile_assignment = + data_sharding.tile_assignment().Slice(slice_starts, tile_assignment_dims); + return HloSharding::Tile(tile_assignment); +} + +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter) { + auto computation = scatter.to_apply(); + // We only handle computations with 2 parameters and only 1 calculation. 
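// Descriptive note, not part of the patch: a reduce computation with two
// parameters and a single reducing root consists of exactly three
// instructions (parameter 0, parameter 1, and the root), which is the
// magic number behind the instruction_count() != 3 early-out below.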
+ if (computation->instruction_count() != 3) { + return Status( + tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation with 2 parameters and only 1 " + "calculation"); + } + + auto root_instruction = computation->root_instruction(); + if (root_instruction->opcode() == HloOpcode::kAdd || + root_instruction->opcode() == HloOpcode::kOr) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::Zero( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMultiply || + root_instruction->opcode() == HloOpcode::kAnd) { + return std::make_pair(HloInstruction::CreateConstant( + LiteralUtil::One(scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMaximum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MinValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMinimum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MaxValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } + + return Status(tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation which is " + "add/or/multiply/add/min/max"); +} + +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices) { + std::vector devices; + if (sharding.IsReplicated()) { + for (int64 d : available_devices) { + if (!HloSharding::IsReservedDevice(d)) { + devices.push_back(d); + } + } + return devices; + } + + for (int64 i : available_devices) { + if (sharding.UsesDevice(i)) { + devices.push_back(i); + } + } + DCHECK(std::all_of(sharding.tile_assignment().begin(), + sharding.tile_assignment().end(), [&](int64 device) { + return std::find(available_devices.begin(), + available_devices.end(), + device) != available_devices.end(); + })); + return devices; +} + +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h new file mode 100644 index 00000000000..562f6d1420d --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -0,0 +1,149 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace hlo_sharding_util { + +// Given a map, selects the device with higher +// occurrence count (if any). If top_count in not nullptr, it will receive the +// count of the dominant device returned. +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count); + +// Assigns all the instructions of a computation, to a given device. +// This API does not recurse into called computations, and does not assign +// instructions which already have sharding. +Status AssignComputationDevice(HloComputation* computation, int64 device); + +// Given an instruction container, returns the device which is most commonly +// occurring among the instructions. +absl::optional GetMostOccurringDevice( + absl::Span instructions); + +// Given a set of computations, tries to extract the dominant device. A device +// is dominant if the combined occurrence among all the instructions of the +// input computations, is greater/equal than/to dominant_factor (real number +// from 0 to 1). +// This API does not recurse into called computations. +// If no device exists that satisfies the condition, the returned optional will +// hold no value. +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor); + +// Returns the HloSharding with the tile dimensions and tile assignment +// transposed based on the specified dimension numbers. In case of a tile +// maximal sharding returns the original sharding. +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions); + +// Returns the HloSharding with the tile shape reshaped based on the source and +// target shapes and the tile assignment adjusted to correspond to the new tile +// shape or absl::nullopt if the resulting reshape would create an invalid +// sharding (non continuous or non uniformly sized tiles). In case of a tile +// maximal sharding returns the original sharding. +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding); + +// Returns the HloSharding with the tile dimensions and tile assignment +// reversed based on the specified dimension numbers. In case of a tile +// maximal sharding returns the original sharding. +HloSharding ReverseSharding(const HloSharding& sharding, + absl::Span dimensions); + +// Returns a sharding tiled on unique dimension dim by reshaping the tile +// assignment of the sharding argument. Only dimensions in the dims span +// argument are considered for reshaping, the others are ignored. +// Assumptions: sharding is tile sharded, and dim must be included in dims. +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims); + +// Returns true if the provided module includes one or more instructions with +// a tile sharding. +bool ContainsTileSharding(const HloModule& module); + +// Returns the preferred output sharding for a gather op based on the sharding +// of the indces. 
+HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns the preferred index sharding for a gather op based on the sharding +// of the output. +HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo); + +// Returns a new HloSharding for a gather op so that only non offset dimensions +// are sharded. Assume "result" is returned by this function. It is ensured that +// "GetIndexSharding(result, hlo)" will have the same number of elements as +// "result". +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo); + +// Returns the preferred index sharding for a scatter op based on the sharding +// of the data. +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo); + +// Returns the preferred data sharding for a scatter op based on the sharding +// of the index. +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns a new index sharding for a scatter op so that we only shard on first +// "number of scatter_window_dims" dimensions. Assume "result" is returned by +// this function. It is ensured that "ScatterDataSharding(result, hlo)" will +// have the same number of elements as "result". +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo); + +// Returns a new data sharding for a scatter op so that we only shard on +// scatter_window_dims. Assume "result" is returned by this function. It is +// ensured that "ScatterIndexSharding(result, hlo)" will have the same number of +// elements as "result". +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo); + +// Returns an identity value and an HloOpcode for reduce computation of scatter +// instruction. +// - If computation is add/or, return 0/false with corresponding op code; +// - If computation is multiply/and, return 1/true with corresponding op code. +// - If computation is min/max, return max value/min value with corresponding op +// code. +// - Otherwise, return error status. +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter); + +// Given a sharding and a list of devices in the topology, return a +// list of the devices that `sharding` applies to. +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices); + +} // namespace hlo_sharding_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc new file mode 100644 index 00000000000..02496c75965 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc @@ -0,0 +1,206 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace hlo_sharding_util { +namespace { + +TEST(HloShardingUtilTest, TransposeShardingReplicated) { + EXPECT_EQ(TransposeSharding(HloSharding::Replicate(), {0, 1, 2}), + HloSharding::Replicate()); +} + +TEST(HloShardingUtilTest, TransposeShardingTiled) { + HloSharding input = HloSharding::Tile(Array4D({{{{0, 1}}, {{2, 3}}}})); + HloSharding output = + HloSharding::Tile(Array4D({{{{0}, {2}}}, {{{1}, {3}}}})); + EXPECT_EQ(TransposeSharding(input, {3, 0, 1, 2}), output); +} + +TEST(HloShardingUtilTest, ReshapeShardingMaximal) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::AssignDevice(7); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {4, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {20, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 4, 7}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 4, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 16, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7, 5, 3}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 15, 2, 14}); + Array sharding_array({2, 1, 1, 1}); + sharding_array(0, 0, 0, 0) = 0; + sharding_array(1, 0, 0, 0) = 1; + HloSharding sharding = HloSharding::Tile(sharding_array); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, 
ReshapeShardingTiledTrivialDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {3, 1, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 1, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array4D({{{{0}, {1}}}})); + HloSharding output_sharding = + HloSharding::Tile(Array4D({{{{0}}, {{1}}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTrivialDImensionInsertedToEnd) { + Shape input_shape = ShapeUtil::MakeShape(F32, {8, 16}); + Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 1}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, NoopReshapeShardingEmptyTile) { + Shape shape = ShapeUtil::MakeShape(F32, {7, 1, 1}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = ReshapeSharding(shape, shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingScalar) { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1}); + Shape output_shape = ShapeUtil::MakeShape(F32, {}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim0) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0}, {1}, {2}, {3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim1) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0, 2, 1, 3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim0) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1, 2}); + EXPECT_EQ( + result.tile_assignment(), + Array3D({{{0}}, {{1}}, {{2}}, {{3}}, {{4}}, {{5}}, {{6}}, {{7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim1) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0}, {1}, {4}, {5}, {2}, {3}, {6}, {7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim2) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/2, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 4, 6, 1, 3, 5, 7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim2_Batch1) { + // Tile sharding in batch dimension, i.e. + // sharding={devices[2,2,2]0,1,2,3,4,5,6,7,8}. 
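The expected tile assignments in the ReshapeToTileDimension tests above all follow one pattern: devices are read out of the original assignment with the target dim as the slowest-varying coordinate. The 2D sketch below (illustrative, std-only, not the XLA implementation) reproduces the Dim0 and Dim1 expectations.

// Illustrative sketch only: collapse a 2D tile assignment onto `dim`.
#include <cstdint>
#include <vector>

std::vector<int64_t> CollapseToDimSketch(
    const std::vector<std::vector<int64_t>>& tile_assignment, int dim) {
  std::vector<int64_t> order;
  const int64_t rows = tile_assignment.size();
  const int64_t cols = tile_assignment[0].size();
  if (dim == 0) {
    // dim 0 varies slowest: {{0, 1}, {2, 3}} -> 0, 1, 2, 3
    for (int64_t i = 0; i < rows; ++i)
      for (int64_t j = 0; j < cols; ++j) order.push_back(tile_assignment[i][j]);
  } else {
    // dim 1 varies slowest: {{0, 1}, {2, 3}} -> 0, 2, 1, 3
    for (int64_t j = 0; j < cols; ++j)
      for (int64_t i = 0; i < rows; ++i) order.push_back(tile_assignment[i][j]);
  }
  return order;  // all devices end up along dimension `dim`
}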
+ HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + // Reshape on dimensions {1, 2} only, therefore ignoring batch dimension 0. + HloSharding result = ReshapeToTileDimension(sharding, /*dim=*/2, + /*dims=*/{1, 2}); + // Expected result is {devices=[2,1,4]0,2,1,3,4,6,5,7}, i.e. the two + // non-batch dimensions {{0, 1}, {2, 3}} and {{4, 5}, {6, 7}} are individually + // reshaped to tile dimension 2, i.e. {{0, 2, 1, 3}}, {{4, 6, 5, 7}}. + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 1, 3}}, {{4, 6, 5, 7}}})); +} + +} // namespace +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index acc077ab12d..e57c8a83b23 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -91,8 +91,7 @@ string HloValue::ToShortString() const { return absl::StrFormat( "<%d %s%s%s%s>", id(), instruction()->name(), instruction()->shape().IsTuple() ? index().ToString() : "", - is_phi() ? " (phi)" : "", - has_color() ? StrCat(" @", color().value()) : ""); + is_phi() ? " (phi)" : "", has_color() ? StrCat(" @", color()) : ""); } string HloValue::ToString(int indent) const { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 0911af10f38..4661b8fd9e3 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -74,7 +74,6 @@ Status CheckParameterCount(const HloInstruction* calling_instruction, } return Status::OK(); } - } // namespace Status ShapeVerifier::Preprocess(HloInstruction* hlo) { @@ -236,6 +235,40 @@ static Status CheckReplicaGroups(HloInstruction* hlo) { return Status::OK(); } +Status ShapeVerifier::HandleAllGather(HloInstruction* hlo) { + auto ag = Cast(hlo); + TF_RETURN_IF_ERROR(CheckReplicaGroups(ag)); + TF_RET_CHECK(ag->all_gather_dimension() >= 0); + TF_RET_CHECK(ag->all_gather_dimension() < ag->shape().rank()); + TF_RET_CHECK(ag->all_gather_dimension() < ag->operand(0)->shape().rank()); + if (ag->use_global_device_ids() && ag->replica_groups().empty()) { + return InternalError( + "Replica group must be specified when use_global_device_ids is true"); + } + + int64 shard_count = CeilOfRatio( + ag->shape().dimensions(ag->all_gather_dimension()), + ag->operand(0)->shape().dimensions(ag->all_gather_dimension())); + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } else { + if (ag->replica_groups().empty() || + ag->replica_groups()[0].replica_ids_size() != 1) { + return InternalError( + "Replica group size must be 1 when use_global_device_ids is " + "false if the all-gather is also cross-partition"); + } + } + } else if (!ag->replica_groups().empty()) { + // Cross-replica all-gather: shard count is subgroup size. 
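The shard-count bookkeeping that HandleAllGather introduces here reduces to a simple consistency rule; the sketch below restates the cross-replica case with plain std types (illustrative names, not the verifier's API).

// Illustrative sketch only.
#include <cstdint>
#include <vector>

int64_t CeilOfRatioSketch(int64_t a, int64_t b) { return (a + b - 1) / b; }

// For a cross-replica all-gather (no channel id), the number of shards
// concatenated along the all-gather dimension must match the replica group
// size when explicit groups are given.
bool AllGatherShapeConsistentSketch(
    int64_t operand_dim_size, int64_t output_dim_size,
    const std::vector<std::vector<int64_t>>& replica_groups) {
  const int64_t shard_count =
      CeilOfRatioSketch(output_dim_size, operand_dim_size);
  if (replica_groups.empty()) {
    return shard_count >= 1;  // flat participation; nothing more to check
  }
  return shard_count == static_cast<int64_t>(replica_groups[0].size());
}

// Example: gathering a f32[4,16] operand into f32[16,16] along dimension 0
// with groups {{0,1,2,3}} gives shard_count == 4 == group size.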
+ TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } + return CheckShape(ag, ShapeInference::InferAllGatherShape( + ag->operand(0)->shape(), ag->all_gather_dimension(), + shard_count)); +} + Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) { TF_RETURN_IF_ERROR(CheckReplicaGroups(crs)); @@ -298,7 +331,9 @@ Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) { return CheckShape(hlo, ShapeUtil::MakeShape(U32, {})); } -Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { +namespace { + +Status CheckDuplicatedSourceOrTarget(HloInstruction* hlo) { // A source or target cannot appear twice in the collective-permute's // source-target pairs. absl::flat_hash_set seen_sources; @@ -317,10 +352,30 @@ Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { p.second, hlo->ToString()); } } + return Status::OK(); +} + +} // namespace + +Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(CheckDuplicatedSourceOrTarget(hlo)); return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape( hlo->operand(0)->shape())); } +Status ShapeVerifier::HandleCollectivePermuteStart(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(CheckDuplicatedSourceOrTarget(hlo)); + return CheckShape( + hlo, ShapeUtil::MakeTupleShape( + {hlo->operand(0)->shape(), hlo->operand(0)->shape(), + ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})})); +} + +Status ShapeVerifier::HandleCollectivePermuteDone(HloInstruction* hlo) { + return CheckShape( + hlo, ShapeUtil::GetTupleElementShape(hlo->operand(0)->shape(), 0)); +} + Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape( reduce_precision->operand(0)->shape(), @@ -628,9 +683,11 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { shape_size_function_(bitcast->operand(0)->shape())) { return InternalError( "Bitcast cannot have different shape sizes of output (%d) and operand " - "(%d)", + "(%d) (%s) (%s)", shape_size_function_(bitcast->shape()), - shape_size_function_(bitcast->operand(0)->shape())); + shape_size_function_(bitcast->operand(0)->shape()), + bitcast->shape().ToString(true), + bitcast->operand(0)->shape().ToString(true)); } return Status::OK(); } @@ -697,11 +754,7 @@ Status ShapeVerifier::HandleFusion(HloInstruction* fusion) { } for (HloInstruction* fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); - // Since fusion buffers aren't materialized, fusion parameters will not have - // the same memory space as the fusion operand. - if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape(), - /*minor_to_major_only=*/false, - /*ignore_memory_space=*/true)) { + if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) { return InternalError( "Shape mismatch between parameter number %d and its operand in " "%s.", @@ -1343,32 +1396,60 @@ Status CheckSameIsHostTransfer(const HloInstruction* instr1, return Status::OK(); } -// Checks CopyStart and CopyDone nodes. 
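The rewrite that follows replaces the inline CopyStart/CopyDone checks with generic single-user/single-operand helpers; the toy sketch below shows the pairing invariant those helpers enforce, using a hypothetical node type rather than HloInstruction.

// Illustrative sketch only.
#include <string>
#include <vector>

struct ToyNode {
  std::string opcode;
  std::vector<const ToyNode*> operands;
  std::vector<const ToyNode*> users;
};

// A "start" op must have exactly one user, that user must be the matching
// "done" op, and the "done" op's only operand must be the start op.
bool PairedCorrectlySketch(const ToyNode& start,
                           const std::string& done_opcode) {
  return start.users.size() == 1 && start.users[0]->opcode == done_opcode &&
         start.users[0]->operands.size() == 1 &&
         start.users[0]->operands[0] == &start;
}

// Usage: PairedCorrectlySketch(copy_start, "copy-done") or
// PairedCorrectlySketch(cp_start, "collective-permute-done").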
-Status VerifyAsynchronousCopies(const HloModule& module) { +Status VerifySingleUser(const HloInstruction* instruction, + HloOpcode expected_user) { + TF_RET_CHECK(instruction->users().size() == 1) + << "The " << HloOpcodeString(instruction->opcode()) + << " instruction requires one consumer, found " + << instruction->users().size(); + + const HloInstruction* user = instruction->users().front(); + TF_RET_CHECK(user->opcode() == expected_user) + << "The consumer of a " << HloOpcodeString(instruction->opcode()) + << " instruction needs to be " << HloOpcodeString(expected_user) + << ", found " << HloOpcodeString(user->opcode()); + return Status::OK(); +} + +Status VerifySingleOperand(const HloInstruction* instruction, + HloOpcode expected_operand) { + TF_RET_CHECK(instruction->operands().size() == 1) + << "The " << HloOpcodeString(instruction->opcode()) + << " instruction requires one consumer, found " + << instruction->users().size(); + + const HloInstruction* operand = instruction->operand(0); + TF_RET_CHECK(operand->opcode() == expected_operand) + << "The operand of a " << HloOpcodeString(instruction->opcode()) + << " instruction needs to be " << HloOpcodeString(expected_operand) + << ", found " << HloOpcodeString(operand->opcode()); + return Status::OK(); +} + +// Checks asynchronous instruction pairs. +Status VerifyAsynchronousInstructionPairs(const HloModule& module) { // CopyStart must have a single CopyDone user. for (const HloComputation* computation : module.computations()) { for (const HloInstruction* instruction : computation->instructions()) { switch (instruction->opcode()) { case HloOpcode::kCopyStart: { - TF_RET_CHECK(instruction->users().size() == 1) - << "CopyStart instruction requires one consumer, found " - << instruction->users().size(); - const HloInstruction* copy_done = instruction->users().front(); - TF_RET_CHECK(copy_done->opcode() == HloOpcode::kCopyDone) - << "The consumer of a CopyStart instruction needs to be " - "CopyDone, found " - << HloOpcodeString(copy_done->opcode()); + TF_RETURN_IF_ERROR( + VerifySingleUser(instruction, HloOpcode::kCopyDone)); break; } case HloOpcode::kCopyDone: { - TF_RET_CHECK(instruction->operands().size() == 1) - << "CopyDone instruction requires one operand, found " - << instruction->operands().size(); - const HloInstruction* copy_start = instruction->operand(0); - TF_RET_CHECK(copy_start->opcode() == HloOpcode::kCopyStart) - << "The operand of a CopyDone instruction needs to be CopyStart, " - "found " - << HloOpcodeString(copy_start->opcode()); + TF_RETURN_IF_ERROR( + VerifySingleOperand(instruction, HloOpcode::kCopyStart)); + break; + } + case HloOpcode::kCollectivePermuteStart: { + TF_RETURN_IF_ERROR( + VerifySingleUser(instruction, HloOpcode::kCollectivePermuteDone)); + break; + } + case HloOpcode::kCollectivePermuteDone: { + TF_RETURN_IF_ERROR(VerifySingleOperand( + instruction, HloOpcode::kCollectivePermuteStart)); break; } default: @@ -1783,7 +1864,7 @@ StatusOr HloVerifier::Run(HloModule* module) { } TF_RETURN_IF_ERROR(VerifyHloStructure(module)); - TF_RETURN_IF_ERROR(VerifyAsynchronousCopies(*module)); + TF_RETURN_IF_ERROR(VerifyAsynchronousInstructionPairs(*module)); TF_RETURN_IF_ERROR(VerifyChannels(*module)); std::unique_ptr shape_verifier = diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 2e83361a591..85b02e0518c 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -56,9 +56,12 @@ class 
ShapeVerifier : public DfsHloVisitor { Status HandleFft(HloInstruction* fft) override; Status HandleCholesky(HloInstruction* hlo) override; Status HandleTriangularSolve(HloInstruction* hlo) override; + Status HandleAllGather(HloInstruction* hlo) override; Status HandleAllReduce(HloInstruction* crs) override; Status HandleAllToAll(HloInstruction* hlo) override; Status HandleCollectivePermute(HloInstruction* hlo) override; + Status HandleCollectivePermuteStart(HloInstruction* hlo) override; + Status HandleCollectivePermuteDone(HloInstruction* hlo) override; Status HandlePartitionId(HloInstruction* hlo) override; Status HandleReplicaId(HloInstruction* hlo) override; Status HandleReducePrecision(HloInstruction* reduce_precision) override; diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index e2c363e40c5..d9709c50df9 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -710,7 +710,7 @@ TEST_F(HloVerifierTest, CopyStartMultipleCopyDone) { ASSERT_FALSE(status.ok()); EXPECT_THAT( status.error_message(), - HasSubstr("CopyStart instruction requires one consumer, found 2")); + HasSubstr("copy-start instruction requires one consumer, found 2")); } TEST_F(HloVerifierTest, CopyDoneNoCopyStart) { @@ -730,8 +730,8 @@ TEST_F(HloVerifierTest, CopyDoneNoCopyStart) { auto status = verifier().Run(module.get()).status(); ASSERT_FALSE(status.ok()); EXPECT_THAT(status.error_message(), - HasSubstr("The operand of a CopyDone instruction needs to be " - "CopyStart, found tuple")); + HasSubstr("The operand of a copy-done instruction needs to be " + "copy-start, found tuple")); } TEST_F(HloVerifierTest, IotaNonArrayResult) { @@ -1134,5 +1134,87 @@ TEST_F(HloVerifierTest, CollectiveChannelVerifier) { HasSubstr("used for different types of channel instructions")); } +TEST_F(HloVerifierTestLayoutSensitive, CollectivePermuteStartAndDone) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndDone { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + collective-permute-start.1 = (f32[2,3]{1,0:S(1)}, f32[2,3]{1,0:S(1)}, u32[], u32[]) collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_TRUE(status.ok()); +} + +TEST_F(HloVerifierTest, CollectivePermuteStartAndDoneWrongType) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndDoneWrongType { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + collective-permute-start.1 = f32[2,3]{1,0:S(1)} collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected instruction to have shape equal to " + "(f32[2,3], f32[2,3], u32[], u32[])")); +} + +TEST_F(HloVerifierTest, CollectivePermuteStartAndMultipleDone) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndMultipleDone { + p0 = f32[2,3]{1,0:S(1)} 
parameter(0) + collective-permute-start.1 = (f32[2,3]{1,0:S(1)}, f32[2,3]{1,0:S(1)}, u32[], u32[]) collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + ROOT collective-permute-done.2 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT( + status.error_message(), + HasSubstr("collective-permute-start instruction requires one consumer, " + "found 2")); +} + +TEST_F(HloVerifierTest, CollectivePermuteDoneNoCollectivePermuteStart) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteDoneNoCollectivePermuteStart { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + p1 = f32[2,3]{1,0:S(1)} parameter(1) + p2 = u32[] parameter(2) + p3 = u32[] parameter(3) + tuple.1 = (f32[2,3], f32[2,3], u32[], u32[]) tuple(p0, p1, p2, p3) + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(tuple.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("The operand of a collective-permute-done instruction " + "needs to be collective-permute-start, found tuple")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 99242c9ca21..02966cc2bf2 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -145,9 +145,12 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kCholesky: case HloOpcode::kConditional: case HloOpcode::kConvolution: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kCustomCall: case HloOpcode::kDomain: case HloOpcode::kDot: @@ -501,7 +504,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { while (true) { auto next_entry = fusion_queue->DequeueNextInstructionAndOperandsToFuseInOrder(); - auto instruction = next_entry.first; + HloInstruction* instruction = next_entry.first; if (instruction == nullptr) { break; } @@ -511,12 +514,14 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } + VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); if (!operand->IsFusible()) { + VLOG(3) << "Operand (" << operand->ToString() << ") is not fusible"; continue; } @@ -690,6 +695,8 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer, if (FusionWouldDuplicate(*producer, *consumer) && (!may_duplicate_ || is_expensive_(*producer)) && !IsAlwaysDuplicable(*producer)) { + VLOG(4) << "Stopping: fusion may duplicate operand (" + << producer->ToString() << ") , and this is expensive"; return false; } diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index a67c677bd03..82c30f1a710 100644 --- 
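The new VLOG in ShouldFuse above records why fusion is abandoned; as a rough sketch, the predicate below restates that decision with plain booleans (illustrative, not the pass's real signature).

// Illustrative sketch only.
bool ShouldFuseSketch(int producer_user_count, bool producer_is_expensive,
                      bool may_duplicate, bool always_duplicable) {
  // A producer with more than one consumer must be duplicated to be fused
  // into each of them.
  const bool fusion_would_duplicate = producer_user_count > 1;
  if (fusion_would_duplicate && (!may_duplicate || producer_is_expensive) &&
      !always_duplicable) {
    return false;  // would replicate an expensive computation per consumer
  }
  return true;
}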
a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -951,7 +951,8 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape())) { + buffer->shape()) && + instruction->opcode() != HloOpcode::kBitcast) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", @@ -1798,13 +1799,6 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. - return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString()); - } // Some instructions carry mandatory layouts in their shape. if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && @@ -2179,6 +2173,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kConditional: case HloOpcode::kConvert: case HloOpcode::kCos: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: case HloOpcode::kDivide: @@ -2239,6 +2234,8 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kBitcast: case HloOpcode::kBroadcast: case HloOpcode::kCall: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kConstant: case HloOpcode::kConvolution: case HloOpcode::kCopy: diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 304a80c7a52..6e575247e6b 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,27 +814,6 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } -TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { - auto builder = HloComputation::Builder(TestName()); - auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( - {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction( - HloInstruction::CreateBitcast(constant0->shape(), constant0)); - auto m = CreateNewVerifiedModule(); - m->AddEntryComputation(builder.Build()); - - ComputationLayout computation_layout( - m->entry_computation()->ComputeProgramShape()); - LayoutAssignment layout_assignment(&computation_layout); - Status error_status = layout_assignment.Run(m.get()).status(); - EXPECT_FALSE(error_status.ok()); - EXPECT_THAT( - error_status.error_message(), - ::testing::HasSubstr( - "Unexpected bitcast operation seen during layout assignment")); -} - TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. 
const char* module_str = R"( diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 39399df7ad8..cabcc8e06ee 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -64,6 +64,7 @@ cc_library( srcs = ["llvm_util.cc"], hdrs = ["llvm_util.h"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index 453a5cd84b2..f7808773592 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -58,7 +58,7 @@ ENTRY while3 { CompileAndVerifyIr(hlo_string, R"( ; CHECK-LABEL: @body(i8* %retval -; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] +; CHECK: %[[add_result:.*]] = fadd reassoc nsz contract float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] ; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], align 4, !alias.scope ![[alias_scope_md_for_store:[0-9]+]] ; ; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc index da0dbf94ddd..278aa3e1696 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -373,6 +374,28 @@ llvm::Value* IrArray::Index::Linearize(absl::Span dimensions, return logical_linear_index; } +llvm::Value* IrArray::Index::Linearize( + const std::vector& dynamic_dims, + llvm::IRBuilder<>* builder) const { + // Each dimension is multiplied by the product of the sizes of all + // earlier dimensions and added to the accumulator logical_linear_index. + CHECK_EQ(size(), dynamic_dims.size()); + llvm::Value* logical_linear_index = GetConstantWithIndexType(0); + llvm::Value* multiplier = GetConstantWithIndexType(1); + for (ssize_t i = size() - 1; i >= 0; --i) { + llvm::Value* addend = builder->CreateMul((*this)[i], multiplier, "", + /*HasNUW=*/true, /*HasNSW=*/true); + addend = builder->CreateZExtOrTrunc(addend, index_type_); + logical_linear_index = builder->CreateAdd(logical_linear_index, addend, "", + /*HasNUW=*/true, /*HasNSW=*/true); + if (i) { + multiplier = builder->CreateMul(multiplier, dynamic_dims[i], + /*Name=*/"multiplier"); + } + } + return logical_linear_index; +} + llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index, llvm::IRBuilder<>* b, absl::string_view name, diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h index e838c4a0534..c71654f5294 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h @@ -155,6 +155,10 @@ class IrArray { llvm::Value* Linearize(absl::Span dimensions, llvm::IRBuilder<>* builder) const; + // Linearizes the index into the given dynamic dimensions. 
+ llvm::Value* Linearize(const std::vector& dynamic_dims, + llvm::IRBuilder<>* builder) const; + llvm::Type* GetType() const { return index_type_; } llvm::Constant* GetConstantWithIndexType(int64 c) const { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 4c9a8d3e004..6375bf7341f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" @@ -90,7 +91,9 @@ llvm::CallInst* EmitCallToIntrinsic( llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -103,7 +106,9 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -287,7 +292,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, llvm::AllocaInst* alloca = b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { - alloca->setAlignment(llvm::MaybeAlign(alignment)); + alloca->setAlignment(llvm::Align(alignment)); } return alloca; } diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index 83be4334269..b6b3b2dd8b3 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -35,6 +35,14 @@ LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, llvm::IRBuilder<>* b) : body_emitter_(body_emitter), shape_(shape), b_(b) {} +LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, + std::vector dynamic_dims, + llvm::IRBuilder<>* b) + : LoopEmitter::LoopEmitter(body_emitter, shape, b) { + CHECK_EQ(dynamic_dims.size(), shape_.dimensions_size()); + dynamic_dims_ = std::move(dynamic_dims); +} + LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, const IrArray& target_array, llvm::IRBuilder<>* b) : body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status { @@ -84,6 +92,43 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, } } +IrArray::Index LoopEmitter::EmitStaticIndex(ForLoopNest* loop_nest, + llvm::Type* index_type) { + // Create loop nest with one for-loop for each dimension of the target shape. 
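The new Linearize overload above emits the usual row-major index arithmetic over runtime dimension sizes; the host-side sketch below (illustrative names) computes the same value on ordinary integers.

// Illustrative sketch only: dims are dimension sizes, most-major first.
#include <cstdint>
#include <vector>

int64_t LinearizeSketch(const std::vector<int64_t>& index,
                        const std::vector<int64_t>& dims) {
  int64_t linear = 0;
  int64_t multiplier = 1;
  // Walk from the most-minor dimension outward, as the IR builder version
  // does: each component is scaled by the product of all more-minor sizes.
  for (int64_t i = static_cast<int64_t>(index.size()) - 1; i >= 0; --i) {
    linear += index[i] * multiplier;
    multiplier *= dims[i];
  }
  return linear;
}

// Example: index {1, 2} with dims {3, 5} -> 1 * 5 + 2 = 7.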
+ // Loops are added from outermost to innermost order with the ForLoopNest + // class so emit loops in order from most-major dimension down to most-minor + // dimension (of the target shape). + std::vector array_multi_index(shape_.dimensions_size()); + for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) { + int64 dimension = LayoutUtil::Major(shape_.layout(), i); + std::unique_ptr loop = loop_nest->AddLoop( + /*start_index=*/0, + /*end_index=*/shape_.dimensions(dimension), + /*suffix=*/absl::StrFormat("dim.%d", dimension)); + array_multi_index[dimension] = loop->GetIndVarValue(); + } + return IrArray::Index(array_multi_index, shape_, index_type); +} + +IrArray::Index LoopEmitter::EmitDynamicIndex(ForLoopNest* loop_nest, + llvm::Type* index_type) { + CHECK_EQ(shape_.is_dynamic(), true); + // Create loop nest with one for-loop for each dynamic dimensions. + // Loops are added from outermost to innermost order with the ForLoopNest + // class so emit loops in order from most-major dimension down to most-minor + // dimension (of the target shape). + std::vector array_multi_index(shape_.dimensions_size()); + for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) { + int64 dimension = LayoutUtil::Major(shape_.layout(), i); + std::unique_ptr loop = loop_nest->AddLoop( + /*suffix=*/absl::StrFormat("dim.%d", dimension), + /*start_index=*/llvm::ConstantInt::get(index_type, 0), + /*end_index=*/dynamic_dims_[dimension]); + array_multi_index[dimension] = loop->GetIndVarValue(); + } + return IrArray::Index(array_multi_index, shape_, index_type); +} + std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( absl::string_view loop_name, llvm::Type* index_type) { CHECK_NE(index_type, nullptr); @@ -93,21 +138,11 @@ std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( return {IrArray::Index(index_type)}; } - // Create loop nest with one for-loop for each dimension of the target shape. - // Loops are added from outermost to innermost order with the ForLoopNest - // class so emit loops in order from most-major dimension down to most-minor - // dimension (of the target shape). ForLoopNest loop_nest(loop_name, b_); - std::vector array_multi_index(shape_.dimensions_size()); - for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) { - int64 dimension = LayoutUtil::Major(shape_.layout(), i); - std::unique_ptr loop = loop_nest.AddLoop( - /*start_index=*/0, - /*end_index=*/shape_.dimensions(dimension), - /*suffix=*/absl::StrFormat("dim.%d", dimension)); - array_multi_index[dimension] = loop->GetIndVarValue(); - } - IrArray::Index array_index(array_multi_index, shape_, index_type); + + IrArray::Index array_index = dynamic_dims_.empty() + ? EmitStaticIndex(&loop_nest, index_type) + : EmitDynamicIndex(&loop_nest, index_type); // Set IR builder insertion point to the loop body basic block of the // innermost loop. diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index a537c00066b..008205a642a 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { @@ -42,6 +43,12 @@ class LoopEmitter { LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, llvm::IRBuilder<>* b); + + // Constructs a LoopEmitter from an body_emitter that generates + // element of the given target array in the dynamic dimension. + LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, + std::vector dynamic_dims, llvm::IRBuilder<>* b); + // Constructs a LoopEmitter from an element generator that generates each // element of the given target array. LoopEmitter(const ElementGenerator& target_element_generator, @@ -81,11 +88,21 @@ class LoopEmitter { // The shape that the emitted loop iterates through. Shape shape_; + // Dynamic dimensions that emitted loop iterates through. Generate the + // loop based on the dynamic dimensions if this vector is not empty. + std::vector dynamic_dims_; + // Points to the exit block of the emitted loop. If the given shape is // scalar, no loops are emitted and exit_bb_ is nullptr in that case. llvm::BasicBlock* exit_bb_; llvm::IRBuilder<>* b_; + + private: + IrArray::Index EmitStaticIndex(ForLoopNest* loop_nest, + llvm::Type* index_type); + IrArray::Index EmitDynamicIndex(ForLoopNest* loop_nest, + llvm::Type* index_type); }; } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index ef8ddfc1a76..c80646e0c70 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -112,6 +112,8 @@ ExecutionOptions CreateExecutionOptions( } execution_options.set_num_replicas(build_options.num_replicas()); execution_options.set_num_partitions(build_options.num_partitions()); + execution_options.set_use_spmd_partitioning( + build_options.use_spmd_partitioning()); if (build_options.has_device_assignment()) { TF_CHECK_OK(build_options.device_assignment().Serialize( execution_options.mutable_device_assignment())); diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index e1f56727bd2..d937d53d550 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -34,7 +34,7 @@ LogicalBuffer::~LogicalBuffer() {} string LogicalBuffer::ToString() const { string color_string; if (has_color()) { - color_string = absl::StrCat(" @", color().value()); + color_string = absl::StrCat(" @", color()); } return absl::StrCat(instruction_->name(), "[", absl::StrJoin(index_, ","), "](#", id(), color_string, ")"); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 8752e870bb7..0ed72f51754 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -16,53 +16,52 @@ limitations under the License. #include "tensorflow/compiler/xla/service/memory_space_assignment.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/core/lib/math/math_util.h" namespace xla { namespace { // Define a dummy chunk for chunks that will be allocated in the default memory // space and for keeping track of number of asynchronous copies. 
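Both the static and the dynamic loop-emission paths declared above walk dimensions from most-major to most-minor; the host-side sketch below (illustrative, std-only) models the loop nest they emit, with the bounds supplied either by the static shape or by runtime dimension sizes.

// Illustrative sketch only.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

void ForEachIndexSketch(
    const std::vector<int64_t>& dims,  // major-to-minor dimension sizes
    const std::function<void(const std::vector<int64_t>&)>& body) {
  std::vector<int64_t> index(dims.size(), 0);
  std::function<void(std::size_t)> recurse = [&](std::size_t d) {
    if (d == dims.size()) {
      body(index);  // innermost point, i.e. the emitted loop body
      return;
    }
    // One loop per dimension, outermost loop for the most-major dimension.
    for (index[d] = 0; index[d] < dims[d]; ++index[d]) {
      recurse(d + 1);
    }
  };
  recurse(0);
}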
const HeapSimulator::Chunk kDummyChunk{-1, -1}; +// This variable is used by the cost analysis in estimating how many times each +// while loop will execute. Nested loops will be assumed to have executed +// pow(kWhileExecutionCount, nesting_level) times. +const int kWhileExecutionCount = 5; -// Returns a heuristic value that captures how much putting this tensor to -// the alternate memory would help if the op is memory bound, or otherwise -// how far off is the op to memory boundedness. The larger this number, the -// higher priority it will be placed in the alternate memory. -float GetAlternateMemoryBenefit( - const MemorySpaceAssignmentCostAnalysis& cost_analysis, +} // namespace + +float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( const HloInstruction& instruction, - float elapsed_time_due_to_alternate_mem) { + float elapsed_time_due_to_alternate_mem) const { float elapsed_time_due_to_compute = - cost_analysis.GetInstructionElapsedDueToCompute(instruction); + GetInstructionElapsedDueToCompute(instruction); float elapsed_time_due_to_memory = - cost_analysis.GetInstructionElapsedDueToMemory(instruction); + GetInstructionElapsedDueToMemory(instruction); if (elapsed_time_due_to_memory > elapsed_time_due_to_compute) { // Memory bound, return how much alternate memory is better. - return elapsed_time_due_to_memory - elapsed_time_due_to_alternate_mem; + int while_nest_level = CalculateWhileLoopNestLevel(&instruction); + return (elapsed_time_due_to_memory - elapsed_time_due_to_alternate_mem) * + tensorflow::MathUtil::IPow(kWhileExecutionCount, + while_nest_level); } else { // Compute bound, return how far off are we to memory boundedness. return elapsed_time_due_to_memory - elapsed_time_due_to_compute; } } -// Returns a heuristic value of memory boundedness for the given BufferInterval. -// The larger this number, the higher priority it will be placed in the -// alternate memory. -float GetMemoryBoundedness( - const MemorySpaceAssignmentCostAnalysis& cost_analysis, - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { +float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { const HloInstruction& defining_instruction = *interval.buffer->defining_instruction(); - float alternate_mem_benefit = - GetAlternateMemoryBenefit(cost_analysis, defining_instruction, - cost_analysis.GetInstructionElapsedDueToMemory( - defining_instruction, - /*operand_in_alternate_mem=*/{}, - /*output_in_alternate_mem=*/true)); + float alternate_mem_benefit = GetAlternateMemoryBenefit( + defining_instruction, + GetInstructionElapsedDueToMemory(defining_instruction, + /*operand_in_alternate_mem=*/{}, + /*output_in_alternate_mem=*/true)); for (const HloUse& use : interval.buffer->uses()) { float use_alternate_mem_benefit = GetAlternateMemoryBenefit( - cost_analysis, *use.instruction, - cost_analysis.GetInstructionElapsedDueToMemory(*use.instruction, - use.operand_number)); + *use.instruction, + GetInstructionElapsedDueToMemory(*use.instruction, use.operand_number)); // If the benefit is positive (memory bound), add it to this buffer's // benefit. If the benefit is negative (compute bound), calculate the // maximum. @@ -77,7 +76,7 @@ float GetMemoryBoundedness( // Get performance slowdown in seconds of prefetching current BufferInterval // causing to other BufferIntervals. 
float alternate_mem_slowdown = - cost_analysis.GetInstructionElapsedDueToMemorySlowdown(interval.size); + GetInstructionElapsedDueToMemorySlowdown(interval.size); // Scale the slowdown based on the time of this buffer. We would want earlier // buffers have lower slowdown values, because they are less likely to overlap @@ -86,13 +85,28 @@ float GetMemoryBoundedness( // for early HLOs, and full slowdown for mid-to-late HLOs. // TODO(yuemmawang): Further in a smarter way, we want buffers overlapped with // more HLOs have higher slowdown, and vice versa. - float scale = interval.start * 1.0 / cost_analysis.GetScheduleEndTime(); + float scale = interval.start * 1.0 / GetScheduleEndTime(); alternate_mem_slowdown *= scale; return alternate_mem_benefit - alternate_mem_slowdown; } -} // namespace +int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( + const HloInstruction* instruction) const { + int nest_level = 0; + const HloComputation* computation = instruction->parent(); + while (!computation->IsEntryComputation()) { + auto node = call_graph_.GetNode(computation); + auto callsites = node.caller_callsites(); + CHECK_EQ(callsites.size(), 1) << "The module is not flattened!"; + auto callsite = callsites[0]; + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + ++nest_level; + } + computation = callsite.instruction()->parent(); + } + return nest_level; +} float MemorySpaceAssignmentCostAnalysis::GetInstructionElapsedDueToCompute( const HloInstruction& instruction) const { @@ -207,29 +221,30 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( const MemorySpaceAssignmentCostAnalysis& cost_analysis, float min_async_copy_to_overlap_ratio, float max_async_copy_to_overlap_ratio) - : cost_analysis_(cost_analysis), + : elapsed_time_( + cost_analysis.hlo_live_range().instruction_schedule().size(), 0.0), + while_nest_level_( + cost_analysis.hlo_live_range().instruction_schedule().size(), 0), + cost_analysis_(cost_analysis), min_async_copy_to_overlap_ratio_(min_async_copy_to_overlap_ratio), max_async_copy_to_overlap_ratio_(max_async_copy_to_overlap_ratio) { instruction_schedule_ = &cost_analysis_.hlo_live_range().instruction_schedule(); - // First create a vector of elapsed times of HLO instructions. - std::vector instructions_elapsed_time(instruction_schedule_->size(), - 0.0); + // Create a vector of elapsed times and while nesting levels of HLO + // instructions. for (const auto& instruction_and_logical_time : *instruction_schedule_) { float elapsed_time = cost_analysis_.cost_analysis().optimal_seconds( *instruction_and_logical_time.first); int64 logical_time = instruction_and_logical_time.second; - if (logical_time >= instructions_elapsed_time.size()) { - instructions_elapsed_time.resize(logical_time + 1, 0.0); + if (logical_time >= elapsed_time_.size()) { + elapsed_time_.resize(logical_time + 1, 0.0); + while_nest_level_.resize(logical_time + 1, 0); } - instructions_elapsed_time[logical_time] = elapsed_time; - } - // As an optimization, create a cumulative sum vector of elapsed time. 
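CalculateWhileLoopNestLevel above walks the (flattened) call graph and counts while-loop callers; the toy sketch below shows the same walk on a hypothetical computation record.

// Illustrative sketch only.
#include <string>

struct ToyComputation {
  const ToyComputation* caller = nullptr;  // nullptr for the entry computation
  std::string caller_opcode;  // opcode of the instruction calling this one
};

int WhileNestLevelSketch(const ToyComputation* computation) {
  int nest_level = 0;
  while (computation->caller != nullptr) {
    if (computation->caller_opcode == "while") {
      ++nest_level;
    }
    computation = computation->caller;
  }
  return nest_level;
}

// With kWhileExecutionCount = 5, a buffer defined two while loops deep is
// assumed to be touched 5^2 = 25 times as often as one in the entry
// computation.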
- float cumsum = 0.0; - for (float elapsed_time : instructions_elapsed_time) { - cumsum += elapsed_time; - elapsed_time_cumsum_.push_back(cumsum); + elapsed_time_[logical_time] = elapsed_time; + while_nest_level_[logical_time] = + cost_analysis_.CalculateWhileLoopNestLevel( + instruction_and_logical_time.first); } } @@ -275,7 +290,7 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, end_logical_time_ = end_time; // Find the earliest time we're allowed to start prefetching. for (current_logical_prefetch_time_ = start_time; - current_logical_prefetch_time_ <= end_logical_time_ && + current_logical_prefetch_time_ < end_logical_time_ && max_async_copy_to_overlap_ratio_ * async_copy_elapsed_ < GetLogicalIntervalElapsed(current_logical_prefetch_time_, end_logical_time_); @@ -290,9 +305,9 @@ int64 CostAnalysisPrefetchIntervalPicker::Next() { } bool CostAnalysisPrefetchIntervalPicker::Done() const { - // The end time is inclusive, so we're done if the prefetch time is greater - // than that. - if (current_logical_prefetch_time_ > end_logical_time_) { + // The end time is exclusive, so we're done if the prefetch time is greater + // than or equal to the end time. + if (current_logical_prefetch_time_ >= end_logical_time_) { return true; } float logical_interval_elapsed = GetLogicalIntervalElapsed( @@ -303,7 +318,17 @@ bool CostAnalysisPrefetchIntervalPicker::Done() const { float CostAnalysisPrefetchIntervalPicker::GetLogicalIntervalElapsed( int64 start_time, int64 end_time) const { - return elapsed_time_cumsum_[end_time - 1] - elapsed_time_cumsum_[start_time]; + int interval_nest_level = + std::min(while_nest_level_[start_time], while_nest_level_[end_time]); + float total_elapsed = 0; + for (int i = start_time + 1; i < end_time; ++i) { + total_elapsed += + elapsed_time_[i] * + tensorflow::MathUtil::IPow( + kWhileExecutionCount, + std::max(0, while_nest_level_[i] - interval_nest_level)); + } + return total_elapsed; } std::string CostAnalysisPrefetchIntervalPicker::ToDebugString() const { @@ -328,7 +353,7 @@ std::string CostAnalysisPrefetchIntervalPicker::ToNoCopyDebugString( absl::optional CostAnalysisPrefetchIntervalPicker::BufferIntervalAlternateMemoryBenefit( const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { - return GetMemoryBoundedness(cost_analysis_, interval); + return cost_analysis_.GetMemoryBoundedness(interval); } std::string MemorySpaceAssignment::AllocationValue::ToString() const { @@ -496,13 +521,28 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( return false; } } + + if ((position.instruction->opcode() == HloOpcode::kCollectivePermuteStart || + position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { + // Disable memory space allocation for these for now. 
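GetLogicalIntervalElapsed now weights each instruction by how much deeper it sits in while loops than the interval endpoints, replacing the plain cumulative-sum lookup; the sketch below (illustrative names, with the per-iteration factor taken from the kWhileExecutionCount = 5 defined earlier in this file) reproduces that arithmetic.

// Illustrative sketch only.
#include <algorithm>
#include <cmath>
#include <vector>

float IntervalElapsedSketch(const std::vector<float>& elapsed_time,
                            const std::vector<int>& while_nest_level,
                            int start_time, int end_time,
                            int while_execution_count = 5) {
  const int interval_nest_level =
      std::min(while_nest_level[start_time], while_nest_level[end_time]);
  float total = 0.0f;
  for (int i = start_time + 1; i < end_time; ++i) {
    const int extra = std::max(0, while_nest_level[i] - interval_nest_level);
    total += elapsed_time[i] *
             static_cast<float>(std::pow(while_execution_count, extra));
  }
  return total;
}

// Example: elapsed_time {1, 1, 1, 1} with nest levels {0, 1, 1, 0} gives a
// cost of 1 * 5 + 1 * 5 = 10 for the interval (0, 3), reflecting the assumed
// repeated execution of the loop-body instructions at times 1 and 2.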
+ if (position.index == ShapeIndex({0})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } else if (position.index == ShapeIndex({1})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } + } } return true; } bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( - const HloUse& use) const { + const AllocationValue& value, const HloUse& use) const { + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); if (use.instruction->opcode() == HloOpcode::kWhile) { HloComputation* while_body = use.instruction->while_body(); @@ -512,7 +552,6 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( HloValue* parameter_value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt( while_body->parameter_instruction(0), use.operand_index); - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 parameter_time = instruction_schedule.at(while_body->parameter_instruction(0)); int64 root_time = instruction_schedule.at(while_body->root_instruction()); @@ -567,7 +606,54 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( "there is a required default memory assignment."; return false; } + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // For any use of this conditional (the same value might be passed into + // multiple called computations), determine if the parameter->first use + // dependency is short. + int64 conditional_time = instruction_schedule.at(use.instruction); + for (const HloUse& other_use : value.uses()) { + if (other_use.instruction != use.instruction) { + continue; + } + HloComputation* called_computation = + use.instruction->called_computations().at(other_use.operand_number - + 1); + const HloInstruction* parameter_instruction = + called_computation->parameter_instruction(0); + HloValue* parameter_value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt( + parameter_instruction, other_use.operand_index); + int64 parameter_time = instruction_schedule.at(parameter_instruction); + int64 min_use_time = conditional_time; + for (const HloUse& parameter_use : parameter_value->uses()) { + if (parameter_use.instruction->parent() == called_computation && + parameter_use.instruction->opcode() != + HloOpcode::kGetTupleElement && + parameter_use.instruction->opcode() != HloOpcode::kTuple && + parameter_use.instruction->opcode() != HloOpcode::kBitcast) { + min_use_time = std::min( + min_use_time, instruction_schedule.at(parameter_use.instruction)); + } + } + if (options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + parameter_value->shape(), parameter_time, min_use_time)) { + VLOG(4) << "Conditional allocation allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + return true; + } else { + VLOG(4) << "Conditional allocation not allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + } + } + return false; } + return true; } @@ -585,23 +671,35 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( // definition_time: int. Logical time this value was defined in the schedule. // use_times: string. 
This is a semicolon-separated list of integers for all // the use times. + // use_names: string. This is a semicolon-separated list of string + // representation of uses. if (debug_str->empty()) { // Append the column names. absl::StrAppend(debug_str, - "buffer_id,buffer_name,alt_mem_benefit,size,definition_" - "time,use_times\n"); + "buffer_id,buffer_name,alt_mem_benefit,size," + "definition_time,use_times,use_names\n"); } const HloBuffer& buffer = alias_analysis_.GetBufferContainingValue(*interval.buffer); const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 definition_time = instruction_schedule.at(interval.buffer->defining_position().instruction); - std::set use_times; + std::vector> uses; for (const HloValue* value : buffer.values()) { for (const HloUse& use : value->uses()) { - use_times.insert(instruction_schedule.at(use.instruction)); + uses.push_back( + {instruction_schedule.at(use.instruction), use.ToString()}); } } + absl::c_sort(uses); + std::vector use_times; + std::vector use_names; + use_times.reserve(uses.size()); + use_names.reserve(uses.size()); + for (auto use : uses) { + use_times.push_back(use.first); + use_names.push_back(use.second); + } absl::StrAppend(debug_str, buffer.id(), ","); absl::StrAppend(debug_str, "\"", interval.buffer->ToShortString(), "\","); @@ -612,7 +710,8 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( debug_str, alternate_memory_benefit ? *alternate_memory_benefit : 0, ","); absl::StrAppend(debug_str, interval.size, ","); absl::StrAppend(debug_str, definition_time, ","); - absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\""); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\","); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_names, ";"), "\""); absl::StrAppend(debug_str, "\n"); } @@ -745,8 +844,6 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - global_max_time_ = instruction_schedule.at( - module->entry_computation()->root_instruction()); // TODO(berkin): For now, place the phi values due to conditionals in // default memory. @@ -756,20 +853,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (position.instruction->opcode() == HloOpcode::kConditional) { VLOG(3) << "Adding required assignment for condition output: " << value->ToShortString(); - required_assignments_[value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at(position.instruction), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(position.instruction, position.index, + MemorySpace::kDefault); for (const HloComputation* called_computation : position.instruction->called_computations()) { - HloValue* root_value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt( - called_computation->root_instruction(), position.index); - required_assignments_[root_value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at( - called_computation->root_instruction()), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(called_computation->root_instruction(), + position.index, MemorySpace::kDefault); } } } @@ -795,9 +884,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } // Iterate over the uses. 
- for (HloUse use : allocation_value.uses()) { + for (int use_idx = 0; use_idx < allocation_value.uses().size(); + ++use_idx) { + const HloUse& use = allocation_value.uses().at(use_idx); int64 use_time = instruction_schedule.at(use.instruction); int64 latest_prefetch_time = use_time; + bool allow_no_copy_alternate_mem_allocation = true; + absl::optional earliest_prefetch_time = absl::nullopt; // Sequential calls include kWhile, kCall, and kConditional opcodes. bool is_sequential_call = @@ -844,14 +937,41 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // when we look at uses within the while loop body. use_time = instruction_schedule.at(while_body->parameter_instruction(0)); + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // Replace the use time with the earliest parameter of called + // computations. + for (const HloComputation* called_computation : + use.instruction->called_computations()) { + use_time = std::min( + use_time, instruction_schedule.at( + called_computation->parameter_instruction(0))); + } } } // Add a required assignment in default memory if the use not allowed in // alternate memory. - if (!IsUseAllowedInAlternateMemory(use)) { - required_assignments_[allocation_value.value()].push_back( - {MemorySpace::kDefault, use_time, /*chunk=*/absl::nullopt}); + if (!IsUseAllowedInAlternateMemory(allocation_value, use)) { + AddRequiredAssignment(allocation_value.value(), use.instruction, + MemorySpace::kDefault, use_time); + } else if (use_idx > 0) { + // We allow buffers in alternate memory that are passed into + // conditionals to give up their alternate memory allocation inside + // the called computation. This means that if a conditional operator + // has an alternate memory allocation, subsequent uses cannot use the + // same alternate memory allocation in order not to clobber data. So + // we force default memory allocation for these subsequent uses. + const HloUse& previous_use = allocation_value.uses().at(use_idx - 1); + if (previous_use.instruction->opcode() == HloOpcode::kConditional && + previous_use.instruction != use.instruction) { + allow_no_copy_alternate_mem_allocation = false; + earliest_prefetch_time = + instruction_schedule.at(previous_use.instruction); + VLOG(3) << "Previous use (" << previous_use.ToString() + << ") of use (" << use.ToString() + << ") is a conditional, so this use will need to evict. " + << "Earliest prefetch time = " << *earliest_prefetch_time; + } } // Bitcasts don't define buffers and don't directly consume buffers. @@ -859,10 +979,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { AllocationRequest request; - request.start_time = definition_time; + // Rarely, (e.g., when conditional true and false parameters are the + // same), definition time can be the time of the conditional and use + // time is the parameter use, which is less. 
+ request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; request.size = interval.size; + request.allow_no_copy_alternate_mem_allocation = + allow_no_copy_alternate_mem_allocation; + request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = use; request.allocation_value = &allocation_value; @@ -1048,35 +1174,42 @@ void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { chunk = aliased_allocation->chunk(); } - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - HloValue* value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); - int64 instruction_time = instruction_schedule.at(instruction); + AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), + chunk); +} + +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloValue* value, const HloInstruction* instruction, + MemorySpaceAssignment::MemorySpace memory_space, int64 time, + absl::optional chunk) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. - auto existing_required_assignment = - RequiredMemoryAssignmentAt(value, instruction_time); + auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { - CHECK(aliased_allocation->memory_space() == - existing_required_assignment->memory_space); + CHECK(memory_space == existing_required_assignment->memory_space) + << "inst = " << instruction->ToString() << " at " << time; CHECK((!chunk && !existing_required_assignment->chunk) || chunk->offset == existing_required_assignment->chunk->offset); - VLOG(3) << "Not adding aliased required assignment because there is one " - "already: " - << value->ToShortString() << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); - return; + VLOG(3) << "Not adding required assignment because there is one already: " + << value->ToShortString() << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + } else { + VLOG(3) << "Adding required assignment: " << value->ToShortString() + << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + required_assignments_[value].push_back({memory_space, time, chunk}); } +} - required_assignments_[value].push_back( - {aliased_allocation->memory_space(), instruction_time, chunk}); - VLOG(3) << "Adding aliased required assignment: " << value->ToShortString() - << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? 
"def" - : "alt"); +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloInstruction* instruction, ShapeIndex index, + MemorySpace memory_space, absl::optional chunk) { + const HloValue* value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); + int64 instruction_time = + hlo_live_range_.instruction_schedule().at(instruction); + AddRequiredAssignment(value, instruction, memory_space, instruction_time, + chunk); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1174,10 +1307,13 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks() { interval_tree_.Remove(interval.start, interval.end, chunk); } for (const auto& interval : pending_async_copies_) { - async_copy_interval_tree_.Remove(interval.start_time, interval.end_time, - kDummyChunk); if (interval.destination == MemorySpace::kAlternate) { + prefetch_interval_tree_.Remove(interval.start_time, interval.end_time, + kDummyChunk); async_copy_ordering_.RemoveCopy(interval); + } else { + eviction_interval_tree_.Remove(interval.start_time, interval.end_time, + kDummyChunk); } } pending_chunks_.clear(); @@ -1276,6 +1412,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // First try keeping the allocation entirely in the alternate memory. if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && + request.allow_no_copy_alternate_mem_allocation && AllocateInAlternateMemoryNoCopy(request)) { return true; } @@ -1350,6 +1487,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( : "alternate") << " memory between " << start_time << " and " << copy_done_schedule_before_time << " keeping until " << end_time; + CHECK_LT(start_time, copy_done_schedule_before_time); allocations->push_back( absl::make_unique( @@ -1360,27 +1498,37 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( // the limit at any given time. pending_async_copies_.push_back( {start_time, copy_done_schedule_before_time, memory_space}); - async_copy_interval_tree_.Add(start_time, copy_done_schedule_before_time, - kDummyChunk); if (memory_space == MemorySpaceAssignment::MemorySpace::kAlternate) { + prefetch_interval_tree_.Add(start_time, copy_done_schedule_before_time, + kDummyChunk); async_copy_ordering_.AddCopy(pending_async_copies_.back()); + } else { + eviction_interval_tree_.Add(start_time, copy_done_schedule_before_time, + kDummyChunk); } } bool AlternateMemoryBestFitHeap::ViolatesMaximumOutstandingAsyncCopies( - int64 start_time, int64 end_time) const { - if (options_.max_outstanding_async_copies < 0) { + int64 start_time, int64 end_time, bool is_prefetch) const { + if (options_.max_outstanding_prefetches < 0 && is_prefetch) { + return false; + } + if (options_.max_outstanding_evictions < 0 && !is_prefetch) { return false; } - // Count the asynchronous copies in the interval tree for the given interval. - int64 num_async_copies = - async_copy_interval_tree_.ChunksOverlappingInTime(start_time, end_time) - .size(); - - // Add one because we are checking if adding an additional asynchronous copy - // would violate the limit. - return num_async_copies + 1 > options_.max_outstanding_async_copies; + // Count the prefetches/evictions in the interval tree for the given interval. 
+ if (is_prefetch) { + int64 num_prefetches = + prefetch_interval_tree_.ChunksOverlappingInTime(start_time, end_time) + .size(); + return num_prefetches >= options_.max_outstanding_prefetches; + } else { + int64 num_evictions = + eviction_interval_tree_.ChunksOverlappingInTime(start_time, end_time) + .size(); + return num_evictions >= options_.max_outstanding_evictions; + } } bool AlternateMemoryBestFitHeap::ViolatesAsyncCopyOrdering( @@ -1512,6 +1660,9 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { request.allocation_value->defining_position().shape(), eviction_start_time, request.end_time), eviction_end_time); + // Evictions must complete by the time of this use. + preferred_eviction_end_time = + std::min(preferred_eviction_end_time, request.latest_prefetch_time); BufferInterval eviction_mem_interval; eviction_mem_interval.buffer = request.allocation_value->value(); @@ -1519,8 +1670,7 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { // Try to reserve a buffer from the end of the previous allocation to the // preferred eviction end time. eviction_mem_interval.start = eviction_end_time + 1; - eviction_mem_interval.end = - std::min(preferred_eviction_end_time, global_max_time_); + eviction_mem_interval.end = preferred_eviction_end_time; int64 preferred_offset = prev_allocation->chunk().offset; VLOG(3) << "Eviction (" << eviction_start_time << ", " << eviction_end_time << ") preferred end time = " << eviction_mem_interval.end; @@ -1542,7 +1692,8 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { bool eviction_interval_too_short = (eviction_start_time == eviction_end_time); bool eviction_violates_outstanding_copies = ViolatesMaximumOutstandingAsyncCopies(eviction_start_time, - eviction_end_time); + eviction_end_time, + /*is_prefetch=*/false); // See if this interval would violate the asynchronous copy limit. 
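Two small conditions from the eviction path above, pulled out for clarity: the heuristic eviction end time is clamped so the eviction finishes no later than the latest prefetch time of this use, and an eviction whose start and end collapse to the same slot is treated as too short. A compressed sketch of just those two checks (variable names are illustrative; the real code also consults the outstanding-copy limit):

#include <algorithm>
#include <cstdint>

// Clamps the preferred eviction end time and reports whether the window is
// non-degenerate. The caller would still need to verify the async-copy limit.
bool EvictionWindowIsUsable(int64_t eviction_start_time,
                            int64_t eviction_end_time,
                            int64_t latest_prefetch_time,
                            int64_t* preferred_eviction_end_time) {
  // Evictions must complete by the time the evicted value is needed again.
  *preferred_eviction_end_time =
      std::min(*preferred_eviction_end_time, latest_prefetch_time);
  bool eviction_interval_too_short =
      (eviction_start_time == eviction_end_time);
  return !eviction_interval_too_short;
}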
if (!eviction_interval_too_short && !eviction_violates_outstanding_copies) { @@ -1563,7 +1714,8 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { bool eviction_scheduled = false; for (int64 time = eviction_start_time; time < eviction_end_time; ++time) { VLOG(4) << "Try evicting (" << time << ", " << time + 1 << ")"; - if (!ViolatesMaximumOutstandingAsyncCopies(time, time + 1)) { + if (!ViolatesMaximumOutstandingAsyncCopies(time, time + 1, + /*is_prefetch=*/false)) { VLOG(3) << "Eviction successful."; AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, /*chunk=*/absl::nullopt, time, time + 1, time + 1, @@ -1605,9 +1757,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( // ^ ^ // Copy Copy // Start Done - options_.prefetch_interval_picker->Begin( - request.use, prev_allocation_in_default_mem.earliest_available_time(), - request.latest_prefetch_time); + int64 earliest_prefetch_time = + prev_allocation_in_default_mem.earliest_available_time(); + if (request.earliest_prefetch_time) { + earliest_prefetch_time = + std::max(earliest_prefetch_time, *request.earliest_prefetch_time); + } + options_.prefetch_interval_picker->Begin(request.use, earliest_prefetch_time, + request.latest_prefetch_time); VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); @@ -1618,12 +1775,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( alternate_mem_interval.size = request.size; while (!options_.prefetch_interval_picker->Done()) { alternate_mem_interval.start = options_.prefetch_interval_picker->Next(); + CHECK_LT(alternate_mem_interval.start, request.latest_prefetch_time); VLOG(4) << "Trying alternate memory allocation (" << alternate_mem_interval.start << ", " << request.end_time << ")"; // If this additional asynchronous copy would violate the limit, try a // different interval. 
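The prefetch loop that follows walks candidate copy-start times produced by the interval picker and simply skips any candidate that would exceed the outstanding-prefetch limit, rather than giving up on the prefetch entirely. A standalone sketch of that retry pattern, modeling the picker as a list of candidate start times (illustrative names; the real code CHECK-fails on a too-late start instead of skipping it):

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

// Returns the first candidate copy-start time that passes both checks, or
// nullopt if every candidate fails.
std::optional<int64_t> PickPrefetchStart(
    const std::vector<int64_t>& candidate_start_times,
    int64_t latest_prefetch_time,
    const std::function<bool(int64_t, int64_t)>& violates_copy_limit,
    const std::function<bool(int64_t, int64_t)>& chunk_available) {
  for (int64_t start : candidate_start_times) {
    if (start >= latest_prefetch_time) {
      continue;  // CopyDone must still fit after CopyStart.
    }
    if (violates_copy_limit(start, latest_prefetch_time)) {
      continue;  // Too many outstanding prefetches; try a later start.
    }
    if (chunk_available(start, latest_prefetch_time)) {
      return start;  // Found a start time with both copy budget and space.
    }
  }
  return std::nullopt;
}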
if (ViolatesMaximumOutstandingAsyncCopies(alternate_mem_interval.start, - request.latest_prefetch_time)) { + request.latest_prefetch_time, + /*is_prefetch=*/true)) { VLOG(4) << "This would violate the outstanding async copy limit."; continue; } @@ -1693,28 +1852,48 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( return absl::nullopt; } -/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( - const HloModule& module) { - int64 max_copies = 0; +StatusOr +MemorySpaceAssignment::CalculateAsyncCopyStats() const { + AsyncCopyStats stats; + stats.max_outstanding_async_copies = 0; + stats.num_prefetches = 0; + stats.prefetch_bytes = 0; + stats.num_evictions = 0; + stats.eviction_bytes = 0; int64 current_copies = 0; - for (HloInstruction* instruction : - module.schedule().sequence(module.entry_computation()).instructions()) { - if (instruction->opcode() == HloOpcode::kCopyStart) { - current_copies++; - } else if (instruction->opcode() == HloOpcode::kCopyDone) { - current_copies--; + TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module_)); + for (const HloComputation* computation : + module_->MakeNonfusionComputations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + int64 size = + options_.size_fn(dataflow_analysis->GetUniqueValueAt(instruction)); + if (instruction->shape().layout().memory_space() == + options_.alternate_memory_space) { + ++stats.num_prefetches; + stats.prefetch_bytes += size; + } else { + ++stats.num_evictions; + stats.eviction_bytes += size; + } + } + stats.max_outstanding_async_copies = + std::max(stats.max_outstanding_async_copies, current_copies); } - max_copies = std::max(max_copies, current_copies); } - return max_copies; + return stats; } /*static*/ MemorySpaceAssignment::BufferIntervalCompare MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis) { return [&](const BufferInterval& x, const BufferInterval& y) { - float x_memory_boundedness = GetMemoryBoundedness(cost_analysis, x); - float y_memory_boundedness = GetMemoryBoundedness(cost_analysis, y); + float x_memory_boundedness = cost_analysis.GetMemoryBoundedness(x); + float y_memory_boundedness = cost_analysis.GetMemoryBoundedness(y); if (x_memory_boundedness != y_memory_boundedness) { return x_memory_boundedness > y_memory_boundedness; } @@ -1748,6 +1927,8 @@ bool LooksLikeAnActivation(const HloInstruction* inst) { } } break; + case HloOpcode::kBitcast: + return LooksLikeAnActivation(user); default: return true; } @@ -1764,10 +1945,20 @@ bool IsCrossProgramPrefetchCandidate( !value.uses().empty() && options.size_fn(value) <= options.max_size_in_bytes && absl::c_all_of(value.uses(), [&](const HloUse& use) { - const HloInstruction* gte = + const HloInstruction* inst = use.instruction->operand(use.operand_number); - return gte->opcode() == HloOpcode::kGetTupleElement && - !LooksLikeAnActivation(gte); + + // Skip the LooksLikeAnActivation test since we're testing the + // parent GTE and its children below. 
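The candidate test that follows now also accepts a use whose operand is a bitcast of a get-tuple-element of the entry parameter, where previously only a direct get-tuple-element was accepted. A toy standalone version of that structural check over a simplified instruction node (not the HloInstruction API; the real check additionally rejects operands that look like activations):

#include <vector>

enum class Op { kParameter, kGetTupleElement, kBitcast, kDot, kOther };

struct Node {
  Op op;
  std::vector<const Node*> operands;
};

// Returns true if `operand` looks like a cross-program-prefetchable input:
// either gte(parameter) or bitcast(gte(parameter)).
bool IsPrefetchableOperand(const Node& operand) {
  if (operand.op == Op::kBitcast && operand.operands.size() == 1 &&
      operand.operands[0]->op == Op::kGetTupleElement &&
      operand.operands[0]->operands.size() == 1 &&
      operand.operands[0]->operands[0]->op == Op::kParameter) {
    return true;
  }
  return operand.op == Op::kGetTupleElement;
}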
+ if (inst->opcode() == HloOpcode::kBitcast && + inst->operand(0)->opcode() == HloOpcode::kGetTupleElement && + inst->operand(0)->operand(0)->opcode() == + HloOpcode::kParameter) { + return true; + } + + return inst->opcode() == HloOpcode::kGetTupleElement && + !LooksLikeAnActivation(inst); }); } @@ -1838,8 +2029,13 @@ MemorySpaceAssignment::RunMemorySpaceAssignment( VLOG(3) << "Module after memory space assignment: "; XLA_VLOG_LINES(3, module_->ToString()); TF_CHECK_OK(module_->schedule().Verify()); + TF_ASSIGN_OR_RETURN(AsyncCopyStats stats, CalculateAsyncCopyStats()); VLOG(1) << "Maximum number of outstanding async copies: " - << CountMaximumOutstandingAsyncCopies(*module_); + << stats.max_outstanding_async_copies; + VLOG(1) << "Number of prefetches: " << stats.num_prefetches + << ", in bytes: " << stats.prefetch_bytes; + VLOG(1) << "Number of evictions: " << stats.num_evictions + << ", in bytes: " << stats.eviction_bytes; TF_RETURN_IF_ERROR(VerifyAndExportHeapSimulatorTrace()); @@ -2398,6 +2594,34 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { std::tuple> events; + auto add_allocation_and_verify = [&](int64 start_time, int64 end_time, + const Chunk& chunk, + const HloValue* value) { + events[std::make_tuple(start_time, /*is_free=*/false, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); + events[std::make_tuple(end_time, /*is_free=*/true, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); + + // Get the chunks overlapping in time and search if they overlap in space + // as well. + // TODO(berkin): For now checking against end_time - 1 (exclusive), but we + // really should check against end_time (inclusive) for cases where the + // operand can't share buffer with user (see + // HloDataflowAnalysis::CanShareOperandBufferWithUser). + for (const Chunk& overlapping_chunk : + interval_tree.ChunksOverlappingInTime(start_time, end_time - 1)) { + if (chunk.OverlapsWith(overlapping_chunk)) { + return InternalError( + ("Value %s (%d, %d) off: %d size: %d overlaps with another chunk" + " off: %d size: %d"), + value->ToShortString(), start_time, end_time, chunk.offset, + chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + } + } + interval_tree.Add(start_time, end_time - 1, chunk); + return Status::OK(); + }; + // Go through all instructions in the module to ensure CopyStart/CopyDone // instructions copy between alternate memory and default memory. for (const HloComputation* computation : @@ -2433,34 +2657,73 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { for (const HloValue* value : buffer.values()) { const HloLiveRange::TimeBound& time_bound = hlo_live_range->buffer_live_ranges().at(value); - events[std::make_tuple(time_bound.start, /*is_free=*/false, - value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); - events[std::make_tuple(time_bound.end, /*is_free=*/true, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); - - VLOG(3) << " buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - // Get the chunks overlapping in time and search if they overlap in space - // as well. 
- // TODO(berkin): For now checking against end_time - 1 (exclusive), but we - // really should check against end_time (inclusive) for cases where the - // operand can't share buffer with user (see - // HloDataflowAnalysis::CanShareOperandBufferWithUser). - for (const Chunk& overlapping_chunk : - interval_tree.ChunksOverlappingInTime(time_bound.start, - time_bound.end - 1)) { - if (chunk.OverlapsWith(overlapping_chunk)) { - return InternalError( - ("Buffer %s (%d, %d) off: %d size: %d overlaps with another chunk" - " off: %d size: %d"), - buffer.ToString(), time_bound.start, time_bound.end, chunk.offset, - chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + const HloInstruction* last_use_instruction = nullptr; + int64 last_use_time = time_bound.start; + for (const HloUse& use : value->uses()) { + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } - interval_tree.Add(time_bound.start, time_bound.end - 1, chunk); + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + // Special case when verifying conditional: we internally split the use + // of alternate memory in conditionals, so fish them out from the + // conditionals. + VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + int64 earliest_computation_start_time = time_bound.end; + for (const HloComputation* called_computation : + last_use_instruction->called_computations()) { + earliest_computation_start_time = + std::min(earliest_computation_start_time, + hlo_live_range->computation_span_times() + .at(called_computation) + .start); + int64 parameter_time = -1; + int64 last_use_time = -1; + for (const HloPosition& position : value->positions()) { + if (position.instruction->opcode() == HloOpcode::kParameter && + position.instruction->parent() == called_computation) { + parameter_time = hlo_live_range->instruction_schedule().at( + position.instruction); + break; + } + } + for (const HloUse& use : value->uses()) { + if (use.instruction->parent() == called_computation) { + last_use_time = std::max( + last_use_time, + hlo_live_range->instruction_schedule().at(use.instruction)); + } + } + if (last_use_time != -1) { + CHECK_NE(parameter_time, -1); + VLOG(3) << " computation: " << called_computation->name() << ": (" + << parameter_time << ", " << last_use_time << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } + } + VLOG(3) << " from beginning until first computation: (" + << time_bound.start << ", " + << (earliest_computation_start_time - 1) << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, earliest_computation_start_time - 1, chunk, + value)); + } else { + VLOG(3) << " buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, time_bound.end, chunk, value)); + } } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index eb16db90600..3f59abfd28e 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ 
b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -82,16 +82,31 @@ class MemorySpaceAssignmentCostAnalysis { const HloCostAnalysis& cost_analysis, float async_copy_bandwidth_bytes_per_second, float alternate_mem_bandwidth_bytes_per_second, - const HloLiveRange& hlo_live_range) + const HloLiveRange& hlo_live_range, const CallGraph& call_graph) : cost_analysis_(cost_analysis), async_copy_bandwidth_bytes_per_second_( async_copy_bandwidth_bytes_per_second), alternate_mem_bandwidth_bytes_per_second_( alternate_mem_bandwidth_bytes_per_second), - hlo_live_range_(hlo_live_range) {} + hlo_live_range_(hlo_live_range), + call_graph_(call_graph) {} const HloCostAnalysis& cost_analysis() const { return cost_analysis_; } + // Returns a heuristic value that captures how much putting this tensor to the + // alternate memory would help if the op is memory bound, or otherwise how far + // off is the op to memory boundedness. The larger this number, the higher + // priority it will be placed in the alternate memory. + float GetAlternateMemoryBenefit( + const HloInstruction& instruction, + float elapsed_time_due_to_alternate_mem) const; + + // Returns a heuristic value of memory boundedness for the given + // BufferInterval. The larger this number, the higher priority it will be + // placed in the alternate memory. + float GetMemoryBoundedness( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const; + // Returns the elapsed time in seconds due to compute only. float GetInstructionElapsedDueToCompute( const HloInstruction& instruction) const; @@ -127,6 +142,10 @@ class MemorySpaceAssignmentCostAnalysis { int64 GetScheduleEndTime() const; + // Returns the number of nested while loop levels this instruction resides in. + // 0 means it is not in a while loop. + int CalculateWhileLoopNestLevel(const HloInstruction* instruction) const; + const HloLiveRange& hlo_live_range() const { return hlo_live_range_; } private: @@ -134,6 +153,7 @@ class MemorySpaceAssignmentCostAnalysis { float async_copy_bandwidth_bytes_per_second_; float alternate_mem_bandwidth_bytes_per_second_; const HloLiveRange& hlo_live_range_; + const CallGraph& call_graph_; }; // Abstract base class that memory space assignment uses to pick prefetch @@ -262,10 +282,10 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { // corresponds to the instruction schedule. float GetLogicalIntervalElapsed(int64 start_time, int64 end_time) const; - // For performance reasons, we calculate the prefix sum of the elapsed time so - // that it's efficient to find the elapsed time in seconds in any logical - // interval. - std::vector elapsed_time_cumsum_; + // For each instruction in the flattened schedule, maintain their elapsed time + // and while nesting level. + std::vector elapsed_time_; + std::vector while_nest_level_; const MemorySpaceAssignmentCostAnalysis& cost_analysis_; float min_async_copy_to_overlap_ratio_; @@ -323,9 +343,10 @@ class MemorySpaceAssignment { // the opcode) to be placed on the alternate memory. IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_fn; - // Specifies the upper bound for number of outstanding asynchronous copies, - // -1 for unlimited. - int64 max_outstanding_async_copies = -1; + // Specifies the upper bound for number of outstanding prefetches and + // evictions, -1 for unlimited. 
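The two fields declared next replace the single max_outstanding_async_copies option; both default to -1, meaning unlimited. A hedged usage sketch of how a caller might cap them independently (assuming the usual in-tree setup; the other Options fields such as size_fn and the prefetch interval picker still have to be filled in as elsewhere in this change):

#include "tensorflow/compiler/xla/service/memory_space_assignment.h"

// Sketch only: limit in-flight prefetches and evictions separately.
xla::MemorySpaceAssignment::Options MakeThrottledOptions() {
  xla::MemorySpaceAssignment::Options options;
  options.max_outstanding_prefetches = 2;  // At most 2 concurrent prefetches.
  options.max_outstanding_evictions = 1;   // At most 1 concurrent eviction.
  return options;
}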
+ int64 max_outstanding_prefetches = -1; + int64 max_outstanding_evictions = -1; // If true, tries allocating buffers across (e.g., before and inside a while // loop body) sequential calls (kWhile, kCall, and kConditional). @@ -604,6 +625,15 @@ class MemorySpaceAssignment { AllocationSequence allocation_sequence_; }; + // Statistics of asynchronous copies. + struct AsyncCopyStats { + int64 max_outstanding_async_copies; + int64 num_prefetches; + int64 prefetch_bytes; + int64 num_evictions; + int64 eviction_bytes; + }; + virtual ~MemorySpaceAssignment() = default; // Runs the MemorySpaceAssignment pass. @@ -611,9 +641,8 @@ class MemorySpaceAssignment { HloModule* module, const HloLiveRange& hlo_live_range, const HloAliasAnalysis& alias_analysis, const Options& options); - // Returns the maximum number of outstanding asynchronous copies in the - // module. - static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); + // Calculates asynchronous copy statistics. + StatusOr CalculateAsyncCopyStats() const; static BufferIntervalCompare GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis); @@ -808,11 +837,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // use_times is a sorted sequence of the times of all uses. // latest_prefetch_time is the latest time we can schedule the CopyDone for a // prefetch. + // If allow_no_copy_alternate_mem_allocation is false, an eviction is forced. + // If earliest_prefetch_time is set, prefetches cannot start before this + // value. struct AllocationRequest { int64 start_time; int64 end_time; int64 latest_prefetch_time; int64 size; + bool allow_no_copy_alternate_mem_allocation; + absl::optional earliest_prefetch_time; absl::optional preferred_offset; HloUse use; MemorySpaceAssignment::AllocationValue* allocation_value; @@ -833,7 +867,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; // Returns true if the use is allowed in the alternate memory. - bool IsUseAllowedInAlternateMemory(const HloUse& use) const; + bool IsUseAllowedInAlternateMemory(const AllocationValue& value, + const HloUse& use) const; // Given an HloValue, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. @@ -887,6 +922,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation); + // This sets a required assignment. CHECK fails if there is a conflicting + // required assignment at the same time. + void AddRequiredAssignment(const HloValue* value, + const HloInstruction* instruction, + MemorySpace memory_space, int64 time, + absl::optional chunk = absl::nullopt); + void AddRequiredAssignment(const HloInstruction* instruction, + ShapeIndex index, MemorySpace memory_space, + absl::optional chunk = absl::nullopt); + // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); @@ -909,8 +954,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // Returns true if the addition of an asynchronous copy in the given time // interval would violate the maximum number of asynchronous copies. 
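The AsyncCopyStats struct introduced above replaces the single max-outstanding-copies count with per-kind counters and byte totals, which RunMemorySpaceAssignment then logs. A standalone sketch of how such a summary might be formatted (the struct below mirrors the fields declared in this header but is a local stand-in, not the XLA type):

#include <cstdint>
#include <sstream>
#include <string>

struct AsyncCopyStatsLike {
  int64_t max_outstanding_async_copies = 0;
  int64_t num_prefetches = 0;
  int64_t prefetch_bytes = 0;
  int64_t num_evictions = 0;
  int64_t eviction_bytes = 0;
};

std::string FormatStats(const AsyncCopyStatsLike& stats) {
  std::ostringstream out;
  out << "Maximum number of outstanding async copies: "
      << stats.max_outstanding_async_copies << "\n"
      << "Number of prefetches: " << stats.num_prefetches
      << ", in bytes: " << stats.prefetch_bytes << "\n"
      << "Number of evictions: " << stats.num_evictions
      << ", in bytes: " << stats.eviction_bytes << "\n";
  return out.str();
}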
- bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time, - int64 end_time) const; + bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time, int64 end_time, + bool is_prefetch) const; // Return true if the asynchronous copy would violate the pipelining order. bool ViolatesAsyncCopyOrdering(int64 start_time, int64 end_time) const; @@ -953,8 +998,9 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloAliasAnalysis& alias_analysis_; const HloLiveRange& hlo_live_range_; // We use a interval tree to keep track of the number of outstanding - // asynchronous copies. - BufferIntervalTree async_copy_interval_tree_; + // prefetches and evictions. + BufferIntervalTree prefetch_interval_tree_; + BufferIntervalTree eviction_interval_tree_; AsynchronousCopyOrdering async_copy_ordering_; std::vector> pending_chunks_; std::vector pending_async_copies_; @@ -964,7 +1010,6 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { required_assignments_; // Number of bytes reserved in alternate memory space. int64 reserved_in_bytes_ = 0; - int64 global_max_time_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index b2125d318d0..23b311730f8 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -57,9 +57,10 @@ class MemorySpaceAssignmentTest : public HloTestBase, HloLiveRange::Run(module->schedule(), *alias_analysis, module->entry_computation()) .ValueOrDie(); + std::unique_ptr call_graph = CallGraph::Build(module); MemorySpaceAssignmentCostAnalysis cost_analysis( hlo_cost_analysis, kAsyncCopyBandwidth, kAlternateMemBandwidth, - *hlo_live_range); + *hlo_live_range, *call_graph); CostAnalysisPrefetchIntervalPicker prefetch_interval_picker( CostAnalysisPrefetchIntervalPicker( cost_analysis, /*min_async_copy_to_overlap_ratio=*/0.8, @@ -126,7 +127,8 @@ class MemorySpaceAssignmentTest : public HloTestBase, options.prefetch_interval_picker = prefetch_interval_picker; options.size_fn = size_fn; options.is_allowed_in_alternate_mem_fn = is_allowed_in_alternate_mem; - options.max_outstanding_async_copies = max_outstanding_async_copies; + options.max_outstanding_prefetches = max_outstanding_async_copies; + options.max_outstanding_evictions = max_outstanding_async_copies; options.allocate_across_sequential_calls = GetParam(); options.verify = true; @@ -184,6 +186,47 @@ class MemorySpaceAssignmentTest : public HloTestBase, } } + struct OutstandingAsyncCopies { + int64 max_copies; + int64 max_prefetches; + int64 max_evictions; + }; + + /*static*/ OutstandingAsyncCopies CountMaximumOutstandingAsyncCopies( + const HloModule& module) { + OutstandingAsyncCopies copies{0, 0, 0}; + int64 current_copies = 0; + int64 current_prefetches = 0; + int64 current_evictions = 0; + for (HloInstruction* instruction : module.schedule() + .sequence(module.entry_computation()) + .instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + if (ShapeUtil::GetSubshape(instruction->shape(), {0}) + .layout() + .memory_space() == kAlternateMemorySpace) { + current_prefetches++; + } else { + current_evictions++; + } + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + if (instruction->shape().layout().memory_space() == + kAlternateMemorySpace) { + current_prefetches--; + } else { + current_evictions--; + } + } + 
copies.max_copies = std::max(copies.max_copies, current_copies); + copies.max_prefetches = + std::max(copies.max_prefetches, current_prefetches); + copies.max_evictions = std::max(copies.max_evictions, current_evictions); + } + return copies; + } + std::unique_ptr<HloModule> CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -391,8 +434,8 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 0); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_prefetches, 0); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_evictions, 0); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { @@ -400,8 +443,8 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 1); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_prefetches, 1); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_evictions, 1); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { @@ -409,8 +452,8 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/2); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 2); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_prefetches, 2); + EXPECT_LE(CountMaximumOutstandingAsyncCopies(*module).max_evictions, 2); } // TODO(berkin): This test is broken with some prefetch timing improvements. @@ -1650,6 +1693,324 @@ TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { AssignMemorySpace(module.get()); } +TEST_P(MemorySpaceAssignmentTest, ConditionalShouldBeAllocatedInAlternateMem) { + // Checks if simple conditionals get alternate memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy and gtes got alternate memory allocations.
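The test helper above derives its maxima with a simple running counter over the schedule: each CopyStart increments, each CopyDone decrements, and the peak is recorded along the way. A standalone sketch of that scan over a toy event stream (illustrative only):

#include <algorithm>
#include <cstdint>
#include <vector>

enum class Event { kCopyStart, kCopyDone, kOther };

// Returns the maximum number of copies that are simultaneously in flight.
int64_t MaxOutstandingCopies(const std::vector<Event>& schedule) {
  int64_t current = 0;
  int64_t max_copies = 0;
  for (Event event : schedule) {
    if (event == Event::kCopyStart) {
      ++current;
    } else if (event == Event::kCopyDone) {
      --current;
    }
    max_copies = std::max(max_copies, current);
  }
  return max_copies;
}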
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("neg1"); + auto neg1_operand = neg1->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto neg2 = module->GetComputationWithName("false_computation") + ->GetInstructionWithName("neg2"); + auto neg2_operand = neg2->operand(0); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalAvoidsUnnecessaryPrefetch) { + // Checks if we avoid unnecessary allocation in alternate memory if the input + // won't be used in the computation for a long time. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + neg0 = f32[3]{0} negate(gte0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + neg9 = f32[3]{0} negate(neg8) + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + ROOT add = f32[3]{0} add(neg9, gte1) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + ROOT conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy1 doesn't get unnecessarily allocated in alternate mem + // (due to long negate chain in true_computation) but is prefetched before + // add. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kDefaultMemorySpace); + auto add = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add"); + auto add_operand = add->operand(1); + EXPECT_EQ(add_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUse) { + // Make sure there is an evict when there is a conditional use followed by + // another use. 
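The ConditionalAvoidsUnnecessaryPrefetch test above relies on the prefetch interval picker only issuing a copy when the time remaining until the use overlaps reasonably with the copy itself, so copy1 stays in default memory through the long negate chain and is prefetched just before the add. A heavily simplified toy of that timing trade-off (names and the specific ratios are illustrative; the real picker works from cost-analysis elapsed times, and 0.8 merely mirrors the min overlap ratio used by the tests in this file):

// Toy "is it worth prefetching yet?" check.
bool WorthPrefetchingNow(float copy_elapsed_seconds,
                         float seconds_until_use,
                         float min_overlap_ratio = 0.8f,
                         float max_idle_ratio = 10.0f) {
  if (seconds_until_use < min_overlap_ratio * copy_elapsed_seconds) {
    return false;  // Too late: the copy cannot be hidden behind compute.
  }
  if (seconds_until_use > max_idle_ratio * copy_elapsed_seconds) {
    return false;  // Too early: the value would sit idle in alternate memory.
  }
  return true;
}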
+ absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + add0 = f32[3]{0} add(gte0, gte1) + neg0 = f32[3]{0} negate(add0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + ROOT neg9 = f32[3]{0} negate(neg8) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + ROOT add1 = f32[3]{0} add(copy1, conditional) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure the copy1->add edge is in alternate memory. Before conditional, + // this should be evicted to default memory and neg uses the input from + // default memory. + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kAlternateMemorySpace); + auto add0 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add0"); + auto add0_operand = add0->operand(1); + EXPECT_EQ(add0_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto add1 = + module->GetComputationWithName("entry")->GetInstructionWithName("add1"); + auto add1_operand = add1->operand(0); + EXPECT_EQ(add1_operand->shape().layout().memory_space(), + kDefaultMemorySpace); + EXPECT_EQ(add1_operand->opcode(), HloOpcode::kCopyDone); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUseInWhile) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=2 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = pred[] get-tuple-element(p0), index=2 + cond_tuple = (f32[3]{0}) tuple(gte0) + conditional = f32[3]{0} conditional(gte2, cond_tuple, cond_tuple), true_computation=true_computation, false_computation=false_computation + add = f32[3]{0} add(conditional, gte1) + neg0 = f32[3]{0} negate(add) + neg1 = f32[3]{0} negate(neg0) + ROOT tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, neg1, gte2) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, pred[]) while(tuple), 
condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure copy1/while{0}/cond_tuple{0} gets alternate memory allocation. + // This will force an eviction and a prefetch for while body root. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto conditional = module->GetComputationWithName("while_body") + ->GetInstructionWithName("conditional"); + auto conditional_operand = conditional->operand(1); + EXPECT_EQ(ShapeUtil::GetSubshape(conditional_operand->shape(), {0}) + .layout() + .memory_space(), + kAlternateMemorySpace); + auto while_root = + module->GetComputationWithName("while_body")->root_instruction(); + auto while_root_operand = while_root->operand(0); + EXPECT_THAT( + while_root_operand, + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, + op::GetTupleElement(op::Parameter(0))))); + } +} + +TEST_P(MemorySpaceAssignmentTest, NestedConditional) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + ROOT conditional = f32[3]{0} conditional(compare, p0, p0), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg3 = f32[3]{0} negate(gte) + } + + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure alternate memory allocation gets propagated into both levels of + // conditional. 
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1_operand = module->GetComputationWithName("true_computation2") + ->GetInstructionWithName("neg1") + ->operand(0); + auto neg2_operand = module->GetComputationWithName("false_computation2") + ->GetInstructionWithName("neg2") + ->operand(0); + auto neg3_operand = module->GetComputationWithName("false_computation1") + ->GetInstructionWithName("neg3") + ->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg3_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -2136,7 +2497,8 @@ TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule3) { AssignMemorySpace(module.get(), -1, 5); } -TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule4) { +// TODO(berkin): This might be an incorrect input graph, investigate. +TEST_P(MemorySpaceAssignmentTest, DISABLED_NonEntryComputationSchedule4) { auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); Shape shape2 = ShapeUtil::MakeShape(xla::F32, {3, 3}); @@ -3428,6 +3790,52 @@ TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchTest) { } } +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchBitcastTest) { + HloComputation::Builder builder(TestName()); + + constexpr int kBatch = 8; + constexpr int kFeature = 8; + constexpr int kOutput = 2; + + auto lhs_shape = ShapeUtil::MakeShape(F32, {kBatch, kFeature}); + auto rhs_shape = ShapeUtil::MakeShape(F32, {kOutput, kFeature}); + auto bitcast_shape = ShapeUtil::MakeShape(F32, {kFeature, kOutput}); + auto result_shape = ShapeUtil::MakeShape(F32, {kBatch, kOutput}); + auto tuple_shape = ShapeUtil::MakeTupleShape({lhs_shape, rhs_shape}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "p0")); + + auto lhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs_shape, param, 0)); + auto rhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs_shape, param, 1)); + + auto bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast(bitcast_shape, rhs)); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction(HloInstruction::CreateDot( + result_shape, lhs, bitcast, dot_dnums, DefaultPrecisionConfig(2))); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {param, lhs, rhs, bitcast, dot}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } +} + TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchNestedTupleTest) { HloComputation::Builder builder(TestName()); diff --git 
a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc new file mode 100644 index 00000000000..80eb4017477 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_propagation.h" + +namespace xla { + +StatusOr MemorySpacePropagation::Run(HloModule* module) { + bool modified = false; + TF_ASSIGN_OR_RETURN(auto dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + dataflow_analysis_ = std::move(dataflow_analysis); + + for (HloComputation* computation : module->MakeNonfusionComputations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kFusion) { + // Propagate the operand subshapes. + for (int operand_idx = 0; operand_idx < instruction->operand_count(); + ++operand_idx) { + modified |= + PropagateSubshapes(instruction->operand(operand_idx)->shape(), + instruction->fused_parameter(operand_idx)); + } + + // Propagate output subshapes. + modified |= PropagateSubshapes(instruction->shape(), + instruction->fused_expression_root()); + } + } + } + return modified; +} + +bool MemorySpacePropagation::PropagateSubshapes( + const Shape& caller_shape, const HloInstruction* callee_instruction) const { + bool modified = false; + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(caller_shape)) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + const HloValue& value = dataflow_analysis_->GetUniqueValueAt( + callee_instruction, indexed_shape.index); + + for (const HloPosition& position : value.positions()) { + Shape* shape = ShapeUtil::GetMutableSubshape( + position.instruction->mutable_shape(), position.index); + if (shape->layout().memory_space() != memory_space) { + shape->mutable_layout()->set_memory_space(memory_space); + modified = true; + } + } + } + return modified; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h new file mode 100644 index 00000000000..65a1dfd14a6 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_ + +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// This is a legalization pass that propagates the memory space in the layout to +// the fusion computations. +class MemorySpacePropagation : public HloModulePass { + public: + ~MemorySpacePropagation() override = default; + absl::string_view name() const override { return "memory-space-propagation"; } + StatusOr Run(HloModule* module) override; + + private: + // Given the caller shape (operand or output) and its corresponding + // insturction in the fused computation (parameter or root), propagates the + // memory space to all the subshapes in the callee side. Returns true if the + // module is modified. + bool PropagateSubshapes(const Shape& caller_shape, + const HloInstruction* callee_instruction) const; + + std::unique_ptr dataflow_analysis_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc new file mode 100644 index 00000000000..8d74958f6aa --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc @@ -0,0 +1,203 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_propagation.h" + +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class MemorySpacePropagationTest : public HloTestBase { + public: + MemorySpacePropagationTest() + : HloTestBase(), + verifier_(/*layout_sensitive=*/false, /*allow_mixed_precision*/ false) { + } + + Status Verify(HloModule* module) { return verifier_.Run(module).status(); } + + private: + HloVerifier verifier_; +}; + +TEST_F(MemorySpacePropagationTest, NoMemorySpace) { + absl::string_view hlo_string = R"( + HloModule NoMemorySpace + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)} copy(%param2) + %fusion = s32[6]{0:T(128)} fusion(s32[6]{0:T(128)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_FALSE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_ASSERT_OK_AND_ASSIGN(auto ref, ParseAndReturnVerifiedModule(hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NonTupleOutput) { + absl::string_view hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} 
parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, TupleOutput) { + absl::string_view hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} 
parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index cd679f7412e..07655a61074 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -185,11 +185,11 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopOps", - "@llvm-project//mlir:LoopOpsTransforms", - "@llvm-project//mlir:LoopsToGPUPass", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index 56684b1f726..d5cad385324 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 33d3690d4ab..4645b084eb6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -19,8 +19,8 @@ limitations under the License. 
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project -#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project @@ -31,9 +31,9 @@ limitations under the License. #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/Passes.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/Transforms.h" // from @llvm-project +#include "mlir/Dialect/SCF/Passes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -45,6 +45,7 @@ limitations under the License. #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project @@ -60,34 +61,6 @@ namespace { using ::mlir::xla_lhlo::FusionOp; -// Following are some small transformations that are required to clean up code -// after lowering from linalg to loops. - -// A simple pass that applies lowering of HLO to LHLO only within LHLO ops that -// contain regions with HLO ops, e.g. FusionOp, ReduceOp, SelectAndScatterOp. -// This is needed, as these ops are not closed from above and hence nested pass -// managers can not be applied. -struct NestedHloRegionsConverter - : public mlir::PassWrapper { - void runOnFunction() override { - auto& ctx = getContext(); - mlir::OwningRewritePatternList patterns; - mlir::ConversionTarget target(ctx); - target.addLegalDialect<::mlir::xla_lhlo::XlaLhloDialect>(); - ::mlir::xla_hlo::populateHLOToLHLOConversionPattern(&ctx, &patterns); - - getFunction().walk([&](mlir::Operation* op) { - if (op->getNumRegions() == 0) { - return; - } - if (failed(applyPartialConversion(op, target, patterns, nullptr))) { - signalPassFailure(); - } - }); - } -}; - // Replaces a FusionOp by the operations contained in its region. struct FusionOpRemover : public mlir::PassWrapper { @@ -132,7 +105,7 @@ struct StoreForwardingPass // No store operation found. Continue search outside of the parallel // loop if block is in a parallel loop. 
if (auto parallelOp = - llvm::dyn_cast(block->getParentOp())) { + llvm::dyn_cast(block->getParentOp())) { return findStore(parallelOp.getOperation(), matches); } return {}; @@ -378,7 +351,7 @@ struct FixKernelFunctionSignatures struct MapParallelLoops : public mlir::PassWrapper { void runOnFunction() override { - mlir::greedilyMapParallelLoopsToGPU(getFunction().getBody()); + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); } }; @@ -388,8 +361,8 @@ struct MapParallelLoops struct FuseInnerParallelLoops : public mlir::PassWrapper { void runOnFunction() override { - getFunction().walk([](mlir::loop::ParallelOp op) { - mlir::loop::naivelyFuseParallelOps(op.region()); + getFunction().walk([](mlir::scf::ParallelOp op) { + mlir::scf::naivelyFuseParallelOps(op.region()); }); } }; @@ -401,7 +374,7 @@ struct ParallelLoopCollapsingToFirstDim void runOnOperation() override { mlir::Operation* module = getOperation(); - module->walk([&](mlir::loop::ParallelOp op) { + module->walk([&](mlir::scf::ParallelOp op) { unsigned num_loops = op.getNumLoops(); std::vector combinedLoops; combinedLoops.reserve(num_loops); @@ -436,8 +409,10 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); } - // First, lower bodies of LHLO operations that contain HLO ops. - pm.addPass(absl::make_unique()); + // Legalize from HLO to LHLO. + pm.addPass(::mlir::xla_hlo::createLegalizeToLhloPass()); + // Moving `AllocOp`s and inserting missing `DeallocOp`s + pm.addPass(::mlir::createBufferPlacementPass()); // Next, we can strip the outer fusion operation. pm.addPass(absl::make_unique()); // Remove unnecessary LHLO copies. diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index ab71c30dcae..2ed5e709d81 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -313,6 +313,8 @@ StatusOr> Service::CreateModuleConfig( if (execution_options->num_partitions() > 0) { config->set_num_partitions(execution_options->num_partitions()); } + config->set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); config->set_seed(execution_options->seed()); config->set_launch_id(execution_options->launch_id()); config->set_debug_options(execution_options->debug_options()); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index f3c8eec1751..75a80747c1d 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -1856,7 +1857,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, switch (fft_type) { case FFT: case IFFT: - if (in.element_type() != C64) { + if (!primitive_util::IsComplexType(in.element_type())) { return InvalidArgument("%s requires complex input type, found %s.", FftType_Name(fft_type), PrimitiveType_Name(in.element_type())); @@ -1864,8 +1865,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, RET_CHECK_RANK(in); return in; case RFFT: { - if (in.element_type() != F32) { - return InvalidArgument("RFFT requires F32 input type, found %s.", + if (in.element_type() != F32 && in.element_type() != F64) { + return InvalidArgument("RFFT requires F32 or F64 input type, found %s.", PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); @@ -1880,7 +1881,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, fft_length[i]); } } - Shape result = ShapeUtil::ChangeElementType(in, C64); + Shape result = ShapeUtil::ChangeElementType( + in, in.element_type() == F32 ? C64 : C128); // Preserve the size of zero-sized dimensions. if (fft_length[fft_rank - 1] != 0) { result.set_dimensions(result.dimensions_size() - 1, @@ -1889,8 +1891,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return result; } case IRFFT: { - if (in.element_type() != C64) { - return InvalidArgument("IRFFT requires C64 input type, found %s.", + if (!primitive_util::IsComplexType(in.element_type())) { + return InvalidArgument("IRFFT requires complex input type, found %s.", PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); @@ -1999,6 +2001,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return a; } +/* static */ StatusOr ShapeInference::InferAllGatherShape( + const Shape& operand_shape, int64 all_gather_dimension, int64 shard_count) { + TF_RET_CHECK(all_gather_dimension >= 0); + TF_RET_CHECK(all_gather_dimension < operand_shape.rank()); + TF_RET_CHECK(shard_count > 0); + auto shape = operand_shape; + shape.set_dimensions(all_gather_dimension, + shard_count * shape.dimensions(all_gather_dimension)); + return shape; +} + /* static */ StatusOr ShapeInference::InferAllReduceShape( absl::Span operand_shapes) { for (const Shape* operand_shape : operand_shapes) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 2e96a77aa22..2cb5930d098 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -123,6 +123,12 @@ class ShapeInference { // Infers the shape produced by the given triangular solve operation. static StatusOr InferCholeskyShape(const Shape& a); + // Infers the shape produced by an all-gather with the given operand shape, + // concat dimension, and shard count. + static StatusOr InferAllGatherShape(const Shape& operand_shape, + int64 all_gather_dimension, + int64 shard_count); + // Infers the shape produced by a cross replica sum with the given operand // shapes. 
static StatusOr InferAllReduceShape( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 448f5119546..b5ecf6e583e 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -615,8 +615,7 @@ namespace fft { static const char* unsupported_rank = "only supports ranks 1-3"; static const char* invalid_rank = "requires input of at least same rank"; static const char* requires_complex_input = "requires complex input type"; -static const char* requires_f32_input = "requires F32 input type"; -static const char* requires_c64_input = "requires C64 input type"; +static const char* requires_f32_input = "requires F32 or F64 input type"; static const char* dimensions_match = "innermost dimensions match fft_length"; static const char* innermost_dimension_matches = "innermost dimension matches fft_length/2+1"; @@ -654,7 +653,7 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestFftTypes) { Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_c128); } TEST_F(ShapeInferenceTest, InferFftShapeTestIfftRanks) { @@ -672,7 +671,7 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIfftTypes) { Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_c128); } TEST_F(ShapeInferenceTest, InferFftShapeTestRfftRanks) { @@ -747,9 +746,10 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftDimensions) { TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftTypes) { FftType type = FftType::IRFFT; Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); - Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); - fft::Fail(shape_f32, type, {16, 8}, fft::requires_c64_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_c64_input); + Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 5}); + Shape shape_f64_out = ShapeUtil::MakeShape(F64, {16, 8}); + fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_f64_out); } TEST_F(ShapeInferenceTest, MapThatChangesElementType) { diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index a1872330648..995b0ece7cd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -93,6 +94,18 @@ class ShapedBuffer { buffers_.replace_shape_ptr(&on_device_shape_); } + // Reset the shape of this shaped buffer and underlying buffer structure. + // + // Precondition: EqualStructure(this->on_device_shape_, on_device_shape). 
+ void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + CHECK(ShapeUtil::EqualStructure(on_device_shape, on_device_shape_)) + << "Structures are not the same. new: " << on_device_shape + << ", old: " << on_device_shape_; + on_host_shape_ = on_host_shape; + on_device_shape_ = on_device_shape; + buffers_.replace_shape_ptr(&on_device_shape_); + } + // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. const ShapeTree& buffers() const { return buffers_; } @@ -124,9 +137,8 @@ class ShapedBuffer { std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); -// ShapedBuffer derived class which allocates all internal buffers on -// construction and deallocates the memory when the object is -// destructed. +// ScopedShapedBuffer takes allocated buffers as inputs, and deallocates on +// destruction. This class represents an owning wrapper around `ShapedBuffer`. // // TODO(timshen): Remove inheritance between ScopedShapedBuffer and // ShapedBuffer. There should never be a need to consider a ScopedShapedBuffer diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc new file mode 100644 index 00000000000..c6990e76c95 --- /dev/null +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -0,0 +1,1491 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/sharding_propagation.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_split.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ComputationMap = + absl::flat_hash_map; + +// Returns true iff the specified hlo or sharding has a spatially partitioned +// sharding (tiled or replicated) what can be propagated by sharding +// propagation. 
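+// As a rough illustration (HLO sharding syntax; the two-device assignment is
+// only an example, not something this pass requires):
+//   {devices=[2,1]0,1}   tiled across two devices  -> spatially partitioned
+//   {replicated}         replicated                -> spatially partitioned
+//   {maximal device=0}   pinned to a single device -> not spatially partitioned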
+bool IsSpatiallyPartitioned(const HloSharding& sharding) {
+  if (sharding.IsTuple()) {
+    return absl::c_any_of(sharding.tuple_elements(), IsSpatiallyPartitioned);
+  } else {
+    return !sharding.IsTileMaximal() || sharding.IsReplicated();
+  }
+}
+bool IsSpatiallyPartitioned(const HloInstruction* hlo) {
+  return hlo->has_sharding() && IsSpatiallyPartitioned(hlo->sharding());
+}
+
+// Returns true if the lhs sharding is preferable to the rhs sharding.
+// The most specific sharding is tiled, followed by single-device tile maximal,
+// and finally replicated. This order aims primarily to reduce memory usage and
+// secondarily to reduce total compute.
+// Note: This does NOT provide a total ordering, as two different shardings can
+// have the same preference level.
+bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs) {
+  CHECK_EQ(lhs.IsTuple(), rhs.IsTuple());
+  if (lhs.IsTuple()) {
+    // For tuples we consider lhs to have a better sharding if none of the
+    // elements are worse and at least one element is better than in the rhs
+    // sharding.
+    const auto& lhs_shardings = lhs.tuple_elements();
+    const auto& rhs_shardings = rhs.tuple_elements();
+    CHECK_EQ(lhs_shardings.size(), rhs_shardings.size());
+    bool is_better = false;
+    for (int64 i = 0; i < lhs_shardings.size(); ++i) {
+      if (IsShardingMoreSpecific(rhs_shardings[i], lhs_shardings[i])) {
+        return false;
+      }
+      if (IsShardingMoreSpecific(lhs_shardings[i], rhs_shardings[i])) {
+        is_better = true;
+      }
+    }
+    return is_better;
+  }
+  if (!rhs.IsTileMaximal()) {
+    // If we already have a non-tile-maximal sharding then we can't improve
+    // that.
+    return false;
+  } else if (!rhs.IsReplicated()) {
+    // If we are not replicated then only tiled (not tile maximal) shardings
+    // can improve us.
+    return !lhs.IsTileMaximal();
+  } else {
+    // If we are replicated then any non-replicated sharding can improve us.
+    return !lhs.IsReplicated();
+  }
+}
+
+// Returns a sharding where each tuple element is chosen as the more specific
+// one of the corresponding elements in a and b. Requires a and b to have the
+// same tuple nesting.
+HloSharding MergeForMoreSpecificSharding(const HloSharding& a,
+                                         const HloSharding& b) {
+  if (a.IsTuple()) {
+    HloSharding result = a;
+    CHECK(b.IsTuple());
+    CHECK_EQ(a.tuple_elements().size(), b.tuple_elements().size());
+    for (int64 i = 0; i < result.tuple_elements().size(); ++i) {
+      result.tuple_elements()[i] = MergeForMoreSpecificSharding(
+          a.tuple_elements()[i], b.tuple_elements()[i]);
+    }
+    return result;
+  }
+  return IsShardingMoreSpecific(a, b) ? a : b;
+}
+
+// Updates the sharding of the specified instruction with the specified
+// sharding if it is better than the current one, and returns true if a new
+// sharding has been applied.
+bool MaybeImproveInstructionSharding(const HloSharding& sharding,
+                                     HloInstruction* instruction) {
+  // We don't want to propagate tile maximal shardings.
+  if (!IsSpatiallyPartitioned(sharding)) {
+    return false;
+  }
+  // Any sharding is better than no sharding.
+  if (!instruction->has_sharding()) {
+    instruction->set_sharding(sharding);
+    return true;
+  }
+  if (IsShardingMoreSpecific(sharding, instruction->sharding())) {
+    instruction->set_sharding(sharding);
+    return true;
+  }
+  return false;
+}
+
+// Sets the sharding for every element within a tuple to replicated (default
+// sharding). This is necessary because there is no way to represent a tuple
+// sharding when only some of the elements are sharded.
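+// For example (an arbitrary 3-tuple), the result initially gets
+//   sharding={{replicated}, {replicated}, {replicated}}
+// and individual leaf shardings are then overwritten as more specific
+// shardings are inferred.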
+void SetDefaultTupleSharding(HloInstruction* instruction) { + instruction->set_sharding( + HloSharding::SingleTuple(instruction->shape(), HloSharding::Replicate())); +} + +// We consider a convolution kernel to be small iff it is smaller along all +// spatial dimensions then the output of the convolution. The rational is that +// we can either shard the kernel or the output and we want to shard the larger +// one for better efficiency. +bool IsConvolutionKernelSmall(const HloInstruction* instruction) { + CHECK_EQ(instruction->opcode(), HloOpcode::kConvolution); + const HloInstruction* rhs = instruction->operand(1); + const auto& dnums = instruction->convolution_dimension_numbers(); + for (int64 i = 0; i < dnums.input_spatial_dimensions().size(); ++i) { + int64 kernel_dim = + rhs->shape().dimensions(dnums.kernel_spatial_dimensions(i)); + int64 output_dim = + instruction->shape().dimensions(dnums.output_spatial_dimensions(i)); + if (kernel_dim >= output_dim) { + return false; + } + } + return true; +} + +// Return the operand which is the most suitable for determining the sharding +// for the specified instruction or nullptr if there isn't any suitable operand. +const HloInstruction* PickRepresentativeOperand( + const HloInstruction* instruction) { + switch (instruction->opcode()) { + case HloOpcode::kMap: + case HloOpcode::kPad: + case HloOpcode::kPower: + case HloOpcode::kReverse: + case HloOpcode::kSlice: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + // For these opcodes the output sharding has to be determined by the + // sharding of the first operand but we can only determine sharding based + // on it if it already has a sharding. + if (instruction->operand(0)->has_sharding()) { + return instruction->operand(0); + } + return nullptr; + case HloOpcode::kAbs: + case HloOpcode::kAdd: + case HloOpcode::kAnd: + case HloOpcode::kAtan2: + case HloOpcode::kBitcastConvert: + case HloOpcode::kCeil: + case HloOpcode::kClamp: + case HloOpcode::kClz: + case HloOpcode::kCompare: + case HloOpcode::kComplex: + case HloOpcode::kConcatenate: + case HloOpcode::kConvert: + case HloOpcode::kCopy: + case HloOpcode::kCos: + case HloOpcode::kAllGather: + case HloOpcode::kAllReduce: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: + case HloOpcode::kDivide: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFloor: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNegate: + case HloOpcode::kNot: + case HloOpcode::kOr: + case HloOpcode::kPopulationCount: + case HloOpcode::kReal: + case HloOpcode::kReducePrecision: + case HloOpcode::kRemainder: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kRsqrt: + case HloOpcode::kSelect: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kSort: + case HloOpcode::kSqrt: + case HloOpcode::kCbrt: + case HloOpcode::kSubtract: + case HloOpcode::kTanh: + case HloOpcode::kTupleSelect: + case HloOpcode::kWhile: + case HloOpcode::kXor: { + // For these opcodes the output sharding can be determined by any operand + // so we find the operand with the most specific sharding. 
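+      // For instance, for add(%x, %y) where %x carries {devices=[2,1]0,1} and
+      // %y carries {replicated}, %x is picked as the representative operand,
+      // since a tiled sharding is more specific than a replicated one (the
+      // two-device assignment is only illustrative).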
+ const HloInstruction* best_operand = nullptr; + for (const HloInstruction* operand : instruction->operands()) { + if (operand->has_sharding() && + (best_operand == nullptr || + IsShardingMoreSpecific(operand->sharding(), + best_operand->sharding()))) { + best_operand = operand; + } + } + return best_operand; + } + + // There is no suitable operand for the rest of the opcodes. + case HloOpcode::kAddDependency: + case HloOpcode::kAfterAll: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kBatchNormInference: + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBitcast: + case HloOpcode::kBroadcast: + case HloOpcode::kCall: + case HloOpcode::kCholesky: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kConditional: + case HloOpcode::kConstant: + case HloOpcode::kConvolution: + case HloOpcode::kCopyDone: + case HloOpcode::kCopyStart: + case HloOpcode::kCustomCall: + case HloOpcode::kDomain: + case HloOpcode::kDot: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kFft: + case HloOpcode::kFusion: + case HloOpcode::kGather: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kIota: + case HloOpcode::kOutfeed: + case HloOpcode::kParameter: + case HloOpcode::kPartitionId: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kReduce: + case HloOpcode::kReduceWindow: + case HloOpcode::kReplicaId: + case HloOpcode::kReshape: + case HloOpcode::kRng: + case HloOpcode::kRngGetAndUpdateState: + case HloOpcode::kRngBitGenerator: + case HloOpcode::kScatter: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kTrace: + case HloOpcode::kTranspose: + case HloOpcode::kTriangularSolve: + case HloOpcode::kTuple: + case HloOpcode::kGetDimensionSize: + case HloOpcode::kSetDimensionSize: + return nullptr; + } +} + +bool SupportSpatialPartitioning(const HloInstruction* instruction, + const ComputationMap& computation_map, + bool is_spmd) { + if (instruction->parent()->root_instruction() == instruction && + computation_map.find(instruction->parent()) == computation_map.end()) { + // We don't support sharding the root instruction of a computation yet, + // unless the computation is a while body. + return false; + } + + if (instruction->IsElementwise() && + (instruction->opcode() != HloOpcode::kRng || is_spmd)) { + return true; + } + switch (instruction->opcode()) { + case HloOpcode::kBroadcast: + case HloOpcode::kConcatenate: + case HloOpcode::kConditional: + case HloOpcode::kConstant: + case HloOpcode::kConvolution: + case HloOpcode::kDot: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kGather: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kIota: + case HloOpcode::kPad: + case HloOpcode::kReduceWindow: + case HloOpcode::kReshape: + case HloOpcode::kScatter: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kSlice: + case HloOpcode::kSort: + case HloOpcode::kTranspose: + case HloOpcode::kTuple: + case HloOpcode::kWhile: + case HloOpcode::kReduce: + return true; + case HloOpcode::kAllReduce: + // Only if channel_id is not specified. 
+ return instruction->channel_id() == absl::nullopt; + case HloOpcode::kParameter: + return computation_map.find(instruction->parent()) != + computation_map.end(); + case HloOpcode::kReverse: + return is_spmd; + default: + return false; + } +} + +// Tries to update the sharding of the specified instruction based on its +// operands and returns true if the sharding of the instruction have been +// changed and false otherwise. +bool InferShardingFromOperands(HloInstruction* instruction, + const ComputationMap& computation_map, + bool is_spmd, bool aggressive_prop) { + if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { + // If an array shaped HLO doesn't support spatial partitioning but at least + // one of its operand is replicated then we make the HLO replicated as well. + if (instruction->shape().IsTuple() || instruction->operand_count() == 0 || + instruction == instruction->parent()->root_instruction() || + instruction->HasSideEffect()) { + return false; + } + if (absl::c_any_of(instruction->operands(), [](const HloInstruction* op) { + return op->has_sharding() && op->sharding().IsReplicated(); + })) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + return false; + } + + switch (instruction->opcode()) { + case HloOpcode::kGetTupleElement: { + const HloInstruction* operand = instruction->operand(0); + if (!IsSpatiallyPartitioned(operand)) { + return false; + } + HloSharding new_sharding = operand->sharding().GetSubSharding( + operand->shape(), {instruction->tuple_index()}); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kTuple: { + if (absl::c_none_of(instruction->operands(), + [](const HloInstruction* hlo) { + return IsSpatiallyPartitioned(hlo); + })) { + // None of the operands have a spatially partitioned sharding. + return false; + } + bool changed = false; + if (!instruction->has_sharding()) { + // Set the sharding for all elements in the tuple because it isn't + // possible to set a partial sharding. + SetDefaultTupleSharding(instruction); + changed = true; + } + // Go through each operand and if the operand has a sharding that is + // better than the current sharding for that tuple element then update + // it. + const Shape& shape = instruction->shape(); + std::vector sub_shardings = + instruction->sharding().tuple_elements(); + int64 sub_sharding_index = 0; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const HloInstruction* operand = instruction->operand(i); + if (operand->has_sharding()) { + if (operand->shape().IsTuple()) { + for (int64 i = 0, e = ShapeUtil::GetLeafCount(operand->shape()); + i < e; ++i) { + if (IsShardingMoreSpecific( + operand->sharding().tuple_elements()[i], + sub_shardings[sub_sharding_index + i])) { + sub_shardings[sub_sharding_index + i] = + operand->sharding().tuple_elements()[i]; + } + } + } else { + if (IsShardingMoreSpecific(operand->sharding(), + sub_shardings[sub_sharding_index])) { + sub_shardings[sub_sharding_index] = operand->sharding(); + } + } + } + sub_sharding_index += ShapeUtil::GetLeafCount(operand->shape()); + } + + HloSharding new_sharding = HloSharding::Tuple(shape, sub_shardings); + if (new_sharding != instruction->sharding()) { + instruction->set_sharding(new_sharding); + return true; + } + return changed; + } + case HloOpcode::kReduce: { + // Reduce could have a tuple shape, where the first half of operands are + // the arrays to reduce, and the second half of operands are the init + // values. 
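+      // For example, a variadic reduce(%a, %b, %init_a, %init_b) has
+      // operand_count() == 4: operands 0-1 are the arrays being reduced and
+      // operands 2-3 are the corresponding init values (the names are
+      // illustrative only).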
+ bool changed = false; + for (int64 operand_id = 0; operand_id < instruction->operand_count() / 2; + ++operand_id) { + const HloInstruction* operand = instruction->operand(operand_id); + if (!IsSpatiallyPartitioned(operand)) { + continue; + } + auto get_maybe_tuple_sharding = [&](const HloSharding& sharding) { + if (instruction->operand_count() == 2) { + return sharding; + } + std::vector tuple(instruction->operand_count() / 2, + sharding); + return HloSharding::Tuple(instruction->shape(), tuple); + }; + if (operand->sharding().IsReplicated()) { + changed |= MaybeImproveInstructionSharding( + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + continue; + } + if (absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { + return operand->sharding().tile_assignment().dim(dim) > 1; + })) { + // We are reducing along one of the sharded dimensions. We don't + // support tiled sharding in this case. + changed |= MaybeImproveInstructionSharding( + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + } else { + // We are reducing along some of the non-sharded dimensions. The + // result sharding should be the same as the operand sharding with the + // reduction dimensions removed as they are removed from the result + // shape. + std::vector target_tile_assignment_dimensions; + const auto& dimensions = instruction->dimensions(); + for (int64 i = 0; i < operand->shape().rank(); ++i) { + if (absl::c_find(dimensions, i) == dimensions.end()) { + target_tile_assignment_dimensions.push_back( + operand->sharding().tile_assignment().dim(i)); + } + } + Array new_tile_assignment = + operand->sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + // Use the same sharding for all tuple elements, because they are part + // of the same reduce instruction. + HloSharding new_sharding = + get_maybe_tuple_sharding(HloSharding::Tile(new_tile_assignment)); + changed |= MaybeImproveInstructionSharding(new_sharding, instruction); + } + } + return changed; + } + case HloOpcode::kBroadcast: { + const HloInstruction* op = instruction->operand(0); + if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { + return false; + } + // Heuristic: If an operand is more than 8 times fewer elements than its + // output, do not propagate sharding. + if (ShapeUtil::ElementsIn(instruction->shape()) > + 8 * ShapeUtil::ElementsIn(op->shape())) { + return false; + } + // The output will be tiled along the broadcasted dimension the same way + // as the input for the broadcast while the other dimensions are kept + // non-tiled. 
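+      // As an illustrative example, broadcasting an f32[8] operand sharded as
+      // {devices=[2]0,1} into f32[8,16] with dimensions={0} yields an output
+      // sharding of {devices=[2,1]0,1}: dimension 0 keeps the operand's tiling
+      // while the new dimension 1 stays untiled.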
+ std::vector target_tile_assignment_dimensions; + const auto& dimensions = instruction->dimensions(); + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + auto it = absl::c_find(dimensions, i); + if (it == dimensions.end()) { + target_tile_assignment_dimensions.push_back(1); + } else { + const int64 source_dim = std::distance(dimensions.begin(), it); + target_tile_assignment_dimensions.push_back( + op->sharding().tile_assignment().dim(source_dim)); + } + } + Array new_tile_assignment = op->sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + HloSharding new_sharding = HloSharding::Tile(new_tile_assignment); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kConvolution: { + const auto& dnums = instruction->convolution_dimension_numbers(); + const HloInstruction* lhs = instruction->operand(0); + const HloInstruction* rhs = instruction->operand(1); + auto get_tiled_sharding_based_on_lhs = [&] { + CHECK(!lhs->sharding().IsTileMaximal()); + std::vector output_to_lhs_indices(instruction->shape().rank()); + output_to_lhs_indices[dnums.output_batch_dimension()] = + dnums.input_batch_dimension(); + output_to_lhs_indices[dnums.output_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + output_to_lhs_indices[dnums.output_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(lhs->sharding(), + output_to_lhs_indices); + }; + auto get_tiled_sharding_based_on_rhs = [&] { + CHECK(!rhs->sharding().IsTileMaximal()); + std::vector output_to_rhs_indices(instruction->shape().rank()); + output_to_rhs_indices[dnums.output_batch_dimension()] = + dnums.kernel_input_feature_dimension(); + output_to_rhs_indices[dnums.output_feature_dimension()] = + dnums.kernel_output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + output_to_rhs_indices[dnums.output_spatial_dimensions(i)] = + dnums.kernel_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(rhs->sharding(), + output_to_rhs_indices); + }; + if (auto dot_dims = + dot_as_convolution_util::ParseDotGeneralFromConvolution( + instruction)) { + // lhs_or_rhs: lhs is 0 and rhs is 1. + auto partitioned_only_along = + [&](const HloSharding& sharding, + std::vector& dims, + int64 lhs_or_rhs) { + if (sharding.IsTileMaximal()) { + return false; + } + int64 partition_count = 1; + for (const auto& dim : dims) { + if (lhs_or_rhs == 0) { + partition_count *= sharding.tile_assignment().dim(dim.lhs); + } else { + CHECK_EQ(lhs_or_rhs, 1); + partition_count *= sharding.tile_assignment().dim(dim.rhs); + } + } + return partition_count == + sharding.tile_assignment().num_elements(); + }; + // If LHS/RHS is partitioned only along the batch dimensions, propagate + // the sharding to the output, since batch dimensions are the easiest to + // partition. + if (IsSpatiallyPartitioned(lhs) && + partitioned_only_along(lhs->sharding(), dot_dims->batch_dims, 0)) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + if (IsSpatiallyPartitioned(rhs) && + partitioned_only_along(rhs->sharding(), dot_dims->batch_dims, 1)) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + if (aggressive_prop) { + // If LHS/RHS is partitioned only along the non-contracting + // dimensions, propagate the sharding to the output. 
+ const bool can_propagate_from_lhs = + IsSpatiallyPartitioned(lhs) && + partitioned_only_along(lhs->sharding(), + dot_dims->lhs_non_contracting_dims, 0); + const bool can_propagate_from_rhs = + IsSpatiallyPartitioned(rhs) && + partitioned_only_along(rhs->sharding(), + dot_dims->rhs_non_contracting_dims, 1); + // If we can propagate from both operands, choose the larger one which + // should help us reduce communications. + if (can_propagate_from_lhs && can_propagate_from_rhs) { + if (Product(lhs->shape().dimensions()) >= + Product(rhs->shape().dimensions())) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } else { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + } + if (can_propagate_from_lhs) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + if (can_propagate_from_rhs) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + } + } + + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + if (lhs->sharding().IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + + if (IsConvolutionKernelSmall(instruction)) { + // If the kernel is small compared to the input then we can generate an + // output what is sharded the same way as the input. + const auto& tile_assignment = lhs->sharding().tile_assignment(); + if (tile_assignment.dim(dnums.input_feature_dimension()) > 1) { + return false; + } + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + // If the kernel is large (e.g backward convolution) then we only support + // replicated output. + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + case HloOpcode::kTranspose: { + const HloInstruction* input = instruction->operand(0); + if (!IsSpatiallyPartitioned(input)) { + return false; + } + HloSharding sharding = hlo_sharding_util::TransposeSharding( + input->sharding(), instruction->dimensions()); + return MaybeImproveInstructionSharding(sharding, instruction); + } + case HloOpcode::kReduceWindow: { + const HloInstruction* lhs = instruction->operand(0); + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + + auto has_dilation = [](const WindowDimension& dimensions) { + return dimensions.base_dilation() > 1 || + dimensions.window_dilation() > 1; + }; + if (absl::c_any_of(instruction->window().dimensions(), has_dilation)) { + VLOG(2) << "Not applying sharding to reduce window because dilatation " + "isn't supported yet: " + << instruction->ToString(); + return false; + } + return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + } + case HloOpcode::kSelectAndScatter: { + // Shard according to first operand, as output keeps the same shape. 
+ const HloInstruction* lhs = instruction->operand(0); + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + + auto has_base_dilation = [](const WindowDimension& dimensions) { + return dimensions.base_dilation() > 1; + }; + if (absl::c_any_of(instruction->window().dimensions(), + has_base_dilation)) { + VLOG(2) << "Not applying sharding to select-and-scatter because " + "base dilation isn't supported yet: " + << instruction->ToString(); + return false; + } + return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + } + case HloOpcode::kReshape: { + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + absl::optional new_sharding = + hlo_sharding_util::ReshapeSharding( + instruction->operand(0)->shape(), instruction->shape(), + instruction->operand(0)->sharding()); + if (new_sharding.has_value()) { + return MaybeImproveInstructionSharding(new_sharding.value(), + instruction); + } + return false; + } + case HloOpcode::kReverse: { + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + return MaybeImproveInstructionSharding( + hlo_sharding_util::ReverseSharding( + instruction->operand(0)->sharding(), instruction->dimensions()), + instruction); + } + case HloOpcode::kDot: { + auto& dot_dim_numbs = instruction->dot_dimension_numbers(); + // Batch dimensions are the same for lhs and rhs on dot operations. + int64 num_batch_dims = dot_dim_numbs.lhs_batch_dimensions_size(); + std::vector contracting_dims(2); + contracting_dims[0] = dot_dim_numbs.lhs_contracting_dimensions(0); + contracting_dims[1] = dot_dim_numbs.rhs_contracting_dimensions(0); + std::vector ops_sharding(2, nullptr); + for (int64 op_num = 0; op_num < 2; ++op_num) { + const HloInstruction* op = instruction->operand(op_num); + if (IsSpatiallyPartitioned(op)) { + ops_sharding[op_num] = &op->sharding(); + } + } + if (ops_sharding[0] == nullptr && ops_sharding[1] == nullptr) { + return false; + } + + // Select representative operand. + int64 representative_op = -1; + if (ops_sharding[0] == nullptr) { + representative_op = 1; + } else if (ops_sharding[1] == nullptr) { + representative_op = 0; + } else if (ops_sharding[0]->IsReplicated() && + ops_sharding[1]->IsReplicated()) { + // Both replicated -> replicate + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } else if (!ops_sharding[0]->IsReplicated() && + !ops_sharding[1]->IsReplicated()) { + // Both tile sharded. The dot spatial partitioning implementation + // replicates the operand corresponding to the non-tiled dimension: + // dot(lhs, rhs), sharding={devices=[1, ..., n, 1]} replicates rhs + // dot(lhs, rhs), sharding={devices=[1, ..., 1, n]} replicates lhs + // so set sharding in order to replicate the smaller of lhs and rhs + representative_op = + ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()) < + ShapeUtil::ByteSizeOf(instruction->operand(1)->shape()) + ? 1 + : 0; + } else { + // One is replicated and the other is tiled - pick the tiled one. + representative_op = ops_sharding[0]->IsReplicated() ? 1 : 0; + } + + if (ops_sharding[representative_op]->IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } else { + // Tile-shard instruction according to representative op. 
+ auto sharding = *ops_sharding[representative_op]; + if (instruction->shape().dimensions_size() != + sharding.tile_assignment().num_dimensions()) { + // It is necessarily the case of a matrix x vector, with + // representative_op being the matrix, because the vector op has the + // same shape as instruction. + CHECK_EQ(sharding.tile_assignment().num_dimensions(), + instruction->shape().dimensions_size() + 1); + // Reshape sharding so that last dimension is 1, and then remove + // last dimension. + std::vector non_batch_dims( + sharding.tile_assignment().num_dimensions() - num_batch_dims); + absl::c_iota(non_batch_dims, num_batch_dims); + sharding = hlo_sharding_util::ReshapeToTileDimension( + sharding, num_batch_dims, non_batch_dims); + auto tile_assignment = sharding.tile_assignment(); + auto dimensions = tile_assignment.dimensions(); + CHECK_EQ(dimensions.back(), 1); + dimensions.pop_back(); + tile_assignment.Reshape(dimensions); + sharding = HloSharding::Tile(tile_assignment); + } + return MaybeImproveInstructionSharding(sharding, instruction); + } + } + case HloOpcode::kParameter: { + auto parent_it = computation_map.find(instruction->parent()); + if (parent_it == computation_map.end()) { + return false; + } + const HloInstruction* parent = parent_it->second; + switch (parent->opcode()) { + case HloOpcode::kConditional: { + for (int64 i = 1; i < parent->operand_count(); ++i) { + if (parent->called_computations()[i - 1] == instruction->parent()) { + if (parent->operand(i)->has_sharding()) { + return MaybeImproveInstructionSharding( + parent->operand(i)->sharding(), instruction); + } + return false; + } + } + return false; + } + default: + return false; + } + } + case HloOpcode::kSort: { + const HloInstruction* operand = PickRepresentativeOperand(instruction); + if (!operand || !IsSpatiallyPartitioned(operand)) { + return false; + } + + if (!operand->sharding().IsTileMaximal() && + operand->sharding().tile_assignment().dim( + instruction->dimensions(0)) != 1) { + // Doesn't support sharding the sorting dimension. + return false; + } + + if (instruction->shape().IsTuple()) { + return MaybeImproveInstructionSharding( + HloSharding::SingleTuple(instruction->shape(), operand->sharding()), + instruction); + } else { + return MaybeImproveInstructionSharding(operand->sharding(), + instruction); + } + } + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: { + auto propagate_slicing = [instruction]() { + const HloInstruction* operand = + instruction->opcode() == HloOpcode::kDynamicSlice + ? 
instruction->operand(0) + : instruction->operand(1); + if (!IsSpatiallyPartitioned(operand)) { + return false; + } + + if (operand->sharding().IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + + const auto& tile_assignment = operand->sharding().tile_assignment(); + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + if (tile_assignment.dim(i) > 1 && + instruction->shape().dimensions(i) != + operand->shape().dimensions(i)) { + return false; + } + } + return MaybeImproveInstructionSharding(operand->sharding(), + instruction); + }; + auto propagate_base = [instruction]() { + if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) { + return false; + } + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + return MaybeImproveInstructionSharding( + instruction->operand(0)->sharding(), instruction); + }; + return propagate_slicing() || propagate_base(); + } + case HloOpcode::kGather: { + if (!IsSpatiallyPartitioned(instruction->operand(1))) { + return false; + } + HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( + instruction->operand(1)->sharding(), instruction); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kScatter: { + if (!IsSpatiallyPartitioned(instruction->operand(1)) && + !IsSpatiallyPartitioned(instruction->operand(2))) { + return false; + } + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + case HloOpcode::kWhile: { + if (!instruction->operand(0)->has_sharding()) { + return false; + } + auto sharding = instruction->operand(0)->sharding(); + if (instruction->has_sharding()) { + sharding = + MergeForMoreSpecificSharding(sharding, instruction->sharding()); + } + return MaybeImproveInstructionSharding(sharding, instruction); + } + default: { + const HloInstruction* operand = PickRepresentativeOperand(instruction); + if (!operand || !IsSpatiallyPartitioned(operand)) { + return false; + } + return MaybeImproveInstructionSharding(operand->sharding(), instruction); + } + } + return false; +} + +// Return the sharding that should be propagated from user to instruction. +absl::optional GetShardingFromUser( + const HloInstruction& instruction, const HloInstruction& user, + bool aggressive_prop, bool is_spmd) { + if (!IsSpatiallyPartitioned(&user)) { + return absl::nullopt; + } + switch (user.opcode()) { + case HloOpcode::kBroadcast: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + // Only support when none of the partitioned dimensions in the broadcast + // output belong to new dimensions. + for (int64 i = 0; i < user.shape().rank(); ++i) { + if (user.sharding().tile_assignment().dim(i) > 1 && + absl::c_count(user.dimensions(), i) == 0) { + return absl::nullopt; + } + } + + // The instruction (operand of broadcast) will be tiled the same way + // as the output. 
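+      // E.g. if the broadcast user of an f32[8] operand produces f32[8,16]
+      // with dimensions={0} and is sharded as {devices=[2,1]0,1}, the operand
+      // receives {devices=[2]0,1} (an illustrative two-device assignment).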
+ std::vector target_tile_assignment_dimensions; + for (int64 output_dim : user.dimensions()) { + target_tile_assignment_dimensions.push_back( + user.sharding().tile_assignment().dim(output_dim)); + } + Array new_tile_assignment = user.sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); + } + case HloOpcode::kConcatenate: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + + const int64 cdim = user.concatenate_dimension(); + const Array& tile_assignment = user.sharding().tile_assignment(); + if (tile_assignment.dim(cdim) == 1) { + // If we are concatenating along a non-sharded dimension then the + // operands should have the same sharding as the result. + return user.sharding(); + } + + if (is_spmd) { + // SPMD doesn't support tiling with part of the devices. Return the same + // sharding. + return user.sharding(); + } + + // If we are concatenating along a sharded dimension then we want the + // operands to be distributed among the devices their data is used. + int64 start_offset = 0; + for (HloInstruction* op : user.operands()) { + if (op == &instruction) { + break; + } + start_offset += op->shape().dimensions(cdim); + } + const int64 tile_shape = CeilOfRatio(user.shape().dimensions(cdim), + tile_assignment.dimensions()[cdim]); + std::vector start_indices(tile_assignment.num_dimensions()); + std::vector end_indices = tile_assignment.dimensions(); + start_indices[cdim] = start_offset / tile_shape; + end_indices[cdim] = CeilOfRatio( + start_offset + instruction.shape().dimensions(cdim), tile_shape); + auto new_tile_assignment = + tile_assignment.Slice(start_indices, end_indices); + if (new_tile_assignment.num_elements() == 1) { + return HloSharding::AssignDevice(*new_tile_assignment.begin()); + } + return HloSharding::Tile(new_tile_assignment); + } + case HloOpcode::kConvolution: { + if (auto dot_dims = + dot_as_convolution_util::ParseDotGeneralFromConvolution(&user)) { + const auto& dnums = user.convolution_dimension_numbers(); + auto partitioned_only_along = + [&](const HloSharding& sharding, + std::vector& + dims) { + if (sharding.IsTileMaximal()) { + return false; + } + int64 partition_count = 1; + for (const auto& dim : dims) { + partition_count *= sharding.tile_assignment().dim(dim.output); + } + return partition_count == + sharding.tile_assignment().num_elements(); + }; + // If output is partitioned only along the batch dimensions, or only + // along the non-contracting dimensions, propagate the sharding to the + // operand. 
+ if (&instruction == user.operand(0) && + (partitioned_only_along(user.sharding(), dot_dims->batch_dims) || + partitioned_only_along(user.sharding(), + dot_dims->lhs_non_contracting_dims))) { + std::vector lhs_to_output_indices(user.shape().rank()); + lhs_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + lhs_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + lhs_to_output_indices); + } + if (&instruction == user.operand(1) && + (partitioned_only_along(user.sharding(), dot_dims->batch_dims) || + partitioned_only_along(user.sharding(), + dot_dims->rhs_non_contracting_dims))) { + std::vector rhs_to_output_indices(user.shape().rank()); + rhs_to_output_indices[dnums.kernel_input_feature_dimension()] = + dnums.output_batch_dimension(); + rhs_to_output_indices[dnums.kernel_output_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_output_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + rhs_to_output_indices); + } + } + return absl::nullopt; + } + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + if (user.opcode() == HloOpcode::kDynamicUpdateSlice && + &instruction == user.operand(0)) { + return user.sharding(); + } + const HloInstruction* operand = user.opcode() == HloOpcode::kDynamicSlice + ? user.operand(0) + : user.operand(1); + if (&instruction != operand) { + return absl::nullopt; + } + + const auto& tile_assignment = user.sharding().tile_assignment(); + for (int64 i = 0; i < user.shape().rank(); ++i) { + if (tile_assignment.dim(i) > 1 && + user.shape().dimensions(i) != operand->shape().dimensions(i)) { + return absl::nullopt; + } + } + return user.sharding(); + } + case HloOpcode::kReduceWindow: { + if (&instruction != user.operand(0)) { + return absl::nullopt; + } + return user.sharding(); + } + case HloOpcode::kReshape: { + return hlo_sharding_util::ReshapeSharding( + user.shape(), instruction.shape(), user.sharding()); + } + case HloOpcode::kTranspose: { + // Calculate the dimension numbers for reversing the current transpose + // and then use TransposeSharding to convert the output sharding to an + // input sharding. + std::vector reverse_dimensions(user.dimensions().size()); + for (int64 i = 0; i < user.dimensions().size(); ++i) { + reverse_dimensions[user.dimensions(i)] = i; + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + reverse_dimensions); + } + case HloOpcode::kTuple: { + return user.sharding().GetSubSharding(user.shape(), + {user.operand_index(&instruction)}); + } + case HloOpcode::kGetTupleElement: { + HloSharding new_sharding = + instruction.has_sharding() + ? 
instruction.sharding() + : HloSharding::SingleTuple(instruction.shape(), + HloSharding::Replicate()); + int64 sharding_index = 0; + for (int64 i = 0; i < instruction.shape().tuple_shapes_size(); ++i) { + if (i == user.tuple_index()) { + break; + } + if (instruction.shape().tuple_shapes(i).IsArray()) { + sharding_index += 1; + } else { + sharding_index += + instruction.shape().tuple_shapes(i).tuple_shapes_size(); + } + } + if (user.shape().IsArray()) { + new_sharding.tuple_elements()[sharding_index] = user.sharding(); + } + for (int64 i = 0; i < user.sharding().tuple_elements().size(); ++i) { + new_sharding.tuple_elements()[sharding_index + i] = + user.sharding().tuple_elements()[i]; + } + return new_sharding; + } + case HloOpcode::kDot: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + auto& dim_numbers = user.dot_dimension_numbers(); + int64 op_idx = user.operand_index(&instruction); + // Batch dimensions are the same on lhs and rhs for dot operations. + int64 num_batch_dims = dim_numbers.lhs_batch_dimensions_size(); + int64 num_spatial_dims = + instruction.shape().dimensions_size() - num_batch_dims; + if (num_spatial_dims == 1) { + // This is the vector of a matrix x vector operation -> replicate, + // since tiling on the vector would necessarily be on the contracting + // dimension, which we don't support. + CHECK_EQ(op_idx, 1); + return HloSharding::Replicate(); + } + // Instruction is necessarily a matrix because it is one of the operands + // of a matrix x matrix operation. + CHECK_EQ(num_spatial_dims, 2); + // Propagate tile sharding to the bigger operand, and replicate the other. + auto other_op = user.operand(op_idx ^ 1); + if (ShapeUtil::ByteSizeOf(instruction.shape()) > + ShapeUtil::ByteSizeOf(other_op->shape())) { + return user.sharding(); + } else { + return HloSharding::Replicate(); + } + } + case HloOpcode::kReduce: { + if (instruction.shape().rank() == 0) { + return absl::nullopt; + } + auto user_sharding = + user.shape().IsTuple() + ? user.sharding().GetSubSharding( + user.shape(), {user.operand_index(&instruction)}) + : user.sharding(); + if (user_sharding.IsTileMaximal()) { + return user_sharding; + } + std::vector target_tile_assignment_dimensions( + instruction.shape().rank()); + const auto& dimensions = user.dimensions(); + int64 next_output_dim = 0; + for (int64 i = 0; i < instruction.shape().rank(); ++i) { + if (absl::c_find(dimensions, i) == dimensions.end()) { + target_tile_assignment_dimensions[i] = + user_sharding.tile_assignment().dim(next_output_dim++); + } else { + target_tile_assignment_dimensions[i] = 1; + } + } + auto tile_assignment = user_sharding.tile_assignment(); + tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(tile_assignment); + } + case HloOpcode::kSort: { + if (user.sharding().IsTuple()) { + return user.sharding().GetSubSharding( + user.shape(), {user.operand_index(&instruction)}); + } else { + return user.sharding(); + } + } + case HloOpcode::kReverse: { + return hlo_sharding_util::ReverseSharding(user.sharding(), + user.dimensions()); + } + default: { + // If the user output shape is compatible with the current instruction + // shape excluding element type and the current instruction is supported + // by spatial partitioning, then the user sharding can be used for + // propagation to the current instruction. 
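+      // A convert is a typical example: for %c = bf16[8,16] convert(%x) where
+      // %x is f32[8,16], the shapes match ignoring element type, so %c's
+      // sharding can be proposed for %x (the names are illustrative).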
+      if (ShapeUtil::CompatibleIgnoringElementType(instruction.shape(),
+                                                   user.shape())) {
+        return user.sharding();
+      }
+      return absl::nullopt;
+    }
+  }
+}
+
+// Tries to update the sharding of the specified instruction based on its users
+// and returns true if the sharding of the instruction has been changed and
+// false otherwise.
+bool InferShardingFromUsers(HloInstruction* instruction,
+                            const ComputationMap& computation_map,
+                            bool aggressive_prop, bool is_spmd) {
+  if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) {
+    return false;
+  }
+  bool improved_sharding = false;
+  for (const HloInstruction* user : instruction->users()) {
+    absl::optional<HloSharding> user_sharding =
+        GetShardingFromUser(*instruction, *user, aggressive_prop, is_spmd);
+    if (user_sharding) {
+      improved_sharding |=
+          MaybeImproveInstructionSharding(*user_sharding, instruction);
+    }
+  }
+  return improved_sharding;
+}
+
+// Removes a Sharding custom-call instruction by folding the sharding attribute
+// into its operand. If the operand already has a different sharding, inserts a
+// copy node to reshard.
+StatusOr<bool> ProcessShardingInstruction(HloModule* module) {
+  bool changed = false;
+
+  for (HloComputation* computation : module->computations()) {
+    auto instructions = computation->MakeInstructionPostOrder();
+    std::reverse(instructions.begin(), instructions.end());
+    for (HloInstruction* instruction : instructions) {
+      if (instruction->opcode() != HloOpcode::kCustomCall) {
+        continue;
+      }
+      if (instruction->custom_call_target() != "Sharding") {
+        continue;
+      }
+      TF_RET_CHECK(instruction->has_sharding())
+          << "Sharding instruction must have a sharding attribute";
+      const HloSharding& sharding = instruction->sharding();
+
+      // If the operand has a different sharding from the current sharding
+      // instruction, create a copy node. Otherwise, just remove the sharding
+      // instruction and set the operand sharding.
+      if (instruction->operand(0)->has_sharding() &&
+          instruction->operand(0)->sharding() != sharding) {
+        auto copy = computation->AddInstruction(
+            HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kCopy,
+                                        instruction->mutable_operand(0)));
+        TF_RETURN_IF_ERROR(computation->ReplaceInstruction(instruction, copy));
+        copy->set_sharding(sharding);
+      } else {
+        instruction->mutable_operand(0)->set_sharding(sharding);
+        TF_RETURN_IF_ERROR(
+            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
+        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
+      }
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+}  // namespace
+
+/*static*/ Status ShardingPropagation::NormalizeDomain(
+    const DomainMetadata::Domain& domain, const DomainMetadata* metadata) {
+  if (metadata != nullptr) {
+    TF_ASSIGN_OR_RETURN(const auto& sharding_metadata,
+                        ShardingMetadata::ToShardingMetadata(metadata));
+    const auto& sharding = sharding_metadata->sharding();
+    if (sharding != nullptr) {
+      bool is_spatially_partitioned = !sharding->HasUniqueDevice();
+      if (sharding->IsTuple()) {
+        is_spatially_partitioned = absl::c_any_of(
+            sharding->tuple_elements(),
+            [](const HloSharding& s) { return !s.HasUniqueDevice(); });
+      }
+      if (is_spatially_partitioned) {
+        for (HloInstruction* domain : domain.exit_domains) {
+          domain->mutable_operand(0)->set_sharding(*sharding);
+        }
+        return Status::OK();
+      }
+    }
+  }
+  return ShardingMetadata::NormalizeShardingDomain(domain, metadata);
+}
+
+StatusOr<bool> ShardingPropagation::Run(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(bool any_changed, ProcessShardingInstruction(module));
+
+  // Association of partitionable embedded computations with their parent
+  // instruction.
+  ComputationMap computation_map;
+
+  // Instructions that are related through a computation and need to share the
+  // same sharding.
+  auto get_related_instructions = [](HloInstruction* inst) {
+    if (inst->opcode() == HloOpcode::kWhile) {
+      return std::vector<HloInstruction*>{
+          inst, inst->while_body()->root_instruction(),
+          inst->while_body()->parameter_instruction(0),
+          inst->while_condition()->parameter_instruction(0)};
+    } else if (inst->opcode() == HloOpcode::kConditional) {
+      std::vector<HloInstruction*> comps{inst};
+      for (HloComputation* c : inst->called_computations()) {
+        comps.push_back(c->root_instruction());
+      }
+      return comps;
+    } else {
+      CHECK(false);
+    }
+  };
+
+  // If the instruction is a while, or the root or a parameter of a while body,
+  // then propagate its sharding to the while instruction, to its body root,
+  // and to its condition parameter.
+  std::function<void(HloInstruction*)> maybe_computation_propagation =
+      [&](HloInstruction* instruction) {
+        auto propagate_to_instruction = [&](HloInstruction* search_inst) {
+          auto related_instructions = get_related_instructions(search_inst);
+          if (absl::c_count(related_instructions, instruction)) {
+            for (HloInstruction* inst : related_instructions) {
+              if (!inst->has_sharding() ||
+                  inst->sharding() != instruction->sharding()) {
+                VLOG(2) << "Add computation sharding: " << inst->name();
+                inst->set_sharding(instruction->sharding());
+                maybe_computation_propagation(inst);
+              }
+            }
+          }
+        };
+
+        if (instruction->opcode() == HloOpcode::kConditional ||
+            instruction->opcode() == HloOpcode::kWhile) {
+          propagate_to_instruction(instruction);
+        }
+
+        if (instruction->opcode() == HloOpcode::kParameter ||
+            instruction->parent()->root_instruction() == instruction) {
+          auto it = computation_map.find(instruction->parent());
+          if (it != computation_map.end()) {
+            propagate_to_instruction(it->second);
+          }
+        }
+      };
+
+  // Populate computation_map in order to associate while bodies to their
+  // while instructions.
+  for (auto computation : module->computations()) {
+    for (auto instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile ||
+          instruction->opcode() == HloOpcode::kConditional) {
+        // Check if any of the related instructions has sharding, in which case
+        // propagate it to the other instructions, so they all share the same
+        // sharding, in case the user didn't shard all of them. We don't check
+        // that user shardings are consistent, because such a check is already
+        // done by HloShardingVerifier.
+        const HloInstruction* sharded_inst = nullptr;
+        auto related_instructions = get_related_instructions(instruction);
+        for (auto inst : related_instructions) {
+          if (inst->has_sharding()) {
+            sharded_inst = inst;
+            break;
+          }
+        }
+        if (sharded_inst != nullptr) {
+          // Set the same sharding to all the other related instructions.
+          for (auto inst : related_instructions) {
+            inst->set_sharding(sharded_inst->sharding());
+          }
+        }
+      }
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        computation_map[instruction->while_body()] = instruction;
+      } else if (instruction->opcode() == HloOpcode::kConditional) {
+        for (HloComputation* c : instruction->called_computations()) {
+          computation_map[c] = instruction;
+        }
+      }
+    }
+  }
+
+  // Collect all pre-sharded instructions as we aren't allowed to modify their
+  // sharding.
+  absl::flat_hash_set<const HloInstruction*> provided_shardings;
+  for (const HloComputation* computation : module->computations()) {
+    for (const HloInstruction* inst : computation->instructions()) {
+      if (inst->has_sharding()) {
+        provided_shardings.insert(inst);
+      }
+    }
+  }
+
+  // Consider the root instruction of the entry computation as one with provided
+  // sharding, as its sharding has to match the one expected by the host.
+  provided_shardings.insert(module->entry_computation()->root_instruction());
+
+  // Iterate to a fixpoint that is guaranteed to be reached because we only
+  // strictly improve the sharding of the graph and it can't be improved
+  // indefinitely.
+ int64 iterations = 0; + auto run_to_fix_point = [&](bool aggressive_prop) { + bool changed = true; + while (changed) { + changed = false; + int64 inferred_from_operand_counter = 0; + int64 inferred_from_user_counter = 0; + int64 instruction_counter = 0; + int64 already_sharded_counter = 0; + for (const HloComputation* computation : module->computations()) { + std::vector instructions = + computation->MakeInstructionPostOrder(); + + instruction_counter += instructions.size(); + for (const HloInstruction* instruction : instructions) { + already_sharded_counter += (instruction->has_sharding() ? 1 : 0); + } + + // Remove the instructions where the sharding was provided from the + // outside so we don't modify them. + instructions.erase( + std::remove_if(instructions.begin(), instructions.end(), + [&](HloInstruction* instruction) { + return provided_shardings.contains(instruction); + }), + instructions.end()); + + // First iterate the HLO graph in post order taking shardings from + // operands. + for (HloInstruction* instruction : instructions) { + if (InferShardingFromOperands(instruction, computation_map, is_spmd_, + aggressive_prop)) { + ++inferred_from_operand_counter; + changed = true; + VLOG(2) << "Add sharding (forward-pass): " + << instruction->ToString(); + maybe_computation_propagation(instruction); + } + } + + // Then iterate the HLO graph in reverse post order taking shardings + // from users. + for (auto it = instructions.rbegin(); it != instructions.rend(); ++it) { + if (InferShardingFromUsers(*it, computation_map, aggressive_prop, + is_spmd_)) { + ++inferred_from_user_counter; + changed = true; + VLOG(2) << "Add sharding (backward-pass): " << (*it)->ToString(); + maybe_computation_propagation(*it); + } + } + } + any_changed |= changed; + VLOG(1) << "Sharding propagation iteration " << iterations << ";"; + VLOG(1) << " total instructions: " << instruction_counter; + VLOG(1) << " instructions already sharded: " << already_sharded_counter; + VLOG(1) << " shardings inferred from operands: " + << inferred_from_operand_counter; + VLOG(1) << " shardings inferred from users: " + << inferred_from_user_counter; + ++iterations; + } + }; + run_to_fix_point(false); + run_to_fix_point(true); + + VLOG(1) << "Sharding propagation completed after " << iterations + << " iterations"; + return any_changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/sharding_propagation.h b/tensorflow/compiler/xla/service/sharding_propagation.h new file mode 100644 index 00000000000..2c07a4a6a31 --- /dev/null +++ b/tensorflow/compiler/xla/service/sharding_propagation.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Propagates sharding information around the graph. HLOs that have shardings
+// are kept as-is; those that do not are assigned shardings based on a simple
+// local greedy heuristic.
+class ShardingPropagation : public HloModulePass {
+ public:
+  explicit ShardingPropagation(bool is_spmd = false) : is_spmd_(is_spmd) {}
+  absl::string_view name() const override { return "sharding-propagation"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Function which can be used to apply a spatially partitioned sharding onto
+  // a given domain. It will apply the sharding to the exit edges of the domain
+  // and then rely on the rest of sharding propagation to ensure that the
+  // intermediate nodes get the correct sharding.
+  static Status NormalizeDomain(const DomainMetadata::Domain& domain,
+                                const DomainMetadata* metadata);
+
+ private:
+  bool is_spmd_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc
new file mode 100644
index 00000000000..a9d685a7a93
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sharding_propagation_test.cc
@@ -0,0 +1,1329 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/sharding_propagation.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +using ShardingPropagationTest = HloTestBase; + +TEST_F(ShardingPropagationTest, ElementwiseOperationForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %elementwise { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1) + %add = f32[5,7,11,13]{3,2,1,0} add(%param0, %param1) + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ElementwiseOperationBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %elementwise { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0) + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1) + %add = f32[5,7,11,13]{3,2,1,0} add(%param0, %param1) + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%add), + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastForwardPassNoSharding) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[7,11]{1,0} parameter(0), + sharding={devices=[2,2]0,1,2,3} + %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={1,2} + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); +} + +// Regression Test for b/129569657. 
+TEST_F(ShardingPropagationTest, BroadcastForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[3,2048,2048]{2,1,0} parameter(0), + sharding={devices=[1,2,2]0,1,2,3} + %broadcast = f32[3,2048,2048,3]{3,2,1,0} broadcast(%param0), dimensions={0,1,2} + ROOT %copy = f32[3,2048,2048,3]{3,2,1,0} copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[13]{0} parameter(0) + %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={3} + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast), + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[24,8]{0,1} parameter(0) + %copy = f32[24,8]{0,1} copy(%param0) + ROOT %broadcast = f32[4,24,6,8]{3,2,1,0} broadcast(%copy), dimensions={1,3}, + sharding={devices=[1,2,1,4]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,4]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, MaximalReduceForwardPass) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[5,7]{1,0} reduce(%param0, %init), dimensions={2,3}, to_apply=%add + ROOT %copy = f32[5,7]{0,1} copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, ShardedReduceForwardPass) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[7,11]{1,0} reduce(%param0, %init), dimensions={0,3}, to_apply=%add + ROOT %copy = f32[7,11]{0,1} copy(f32[7,11]{1,0} %reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, 
ShardedTupleReduceForwardAndBackwardPass) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0) + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %copy_param0 = f32[28,10] copy(%param0) + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + %reduce = (f32[28], s32[28]) reduce(%copy_param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func + %gte0 = f32[28] get-tuple-element(%reduce), index=0 + %gte1 = s32[28] get-tuple-element(%reduce), index=1 + %copy0 = f32[28] copy(%gte0) + %copy1 = s32[28] copy(%gte1) + ROOT %tuple = (f32[28], s32[28]) tuple(%copy0, %copy1) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{{devices=[2]0,1},{devices=[2]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "copy_param0"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, GetTupleElementForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %gte { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0) + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param0, %param0) + %tuple.1 = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) tuple( + %param0, %tuple), + sharding={{devices=[1,2,2,1]0,1,2,3}, + {replicated}, + {devices=[1,2,2,1]0,1,2,3}} + %gte = f32[5,7,11,13]{3,2,1,0} get-tuple-element(%tuple.1), index=0 + %gte.1 = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) get-tuple-element( + %tuple.1), index=1 + %gte.2 = f32[5,7,11,13]{3,2,1,0} get-tuple-element(%gte.1), index=0 + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%gte.2) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gte"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "gte.1"), + op::Sharding("{{replicated}," + " {devices=[1,2,2,1]0,1,2,3}}")); + EXPECT_THAT(FindInstruction(module.get(), "gte.2"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, TupleForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %tuple { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={replicated} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={devices=[1,2,2,1]0,1,2,3} + %param2 = f32[5,7,11,13]{3,2,1,0} parameter(2) + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param1, %param2) + %tuple.1 = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) tuple( + %param0, %tuple) + ROOT %copy = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) copy( + %tuple.1) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + 
TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple"), + op::Sharding("{{devices=[1,2,2,1]0,1,2,3}," + " {replicated}}")); + EXPECT_THAT(FindInstruction(module.get(), "tuple.1"), + op::Sharding("{{replicated}," + " {devices=[1,2,2,1]0,1,2,3}," + " {replicated}}")); +} + +TEST_F(ShardingPropagationTest, ForwardConvolutionForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %lhs = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[2,2,2,1]0,1,2,3,4,5,6,7} + %rhs = f32[3,3,13,17]{3,2,1,0} parameter(1) + %convolution = f32[5,7,11,17]{3,2,1,0} convolution(%lhs, %rhs), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f + ROOT %copy = f32[5,7,11,17]{3,2,1,0} copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "convolution"), + op::Sharding("{devices=[2,2,2,1]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, ForwardConvolutionLargeDilationForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %lhs = f32[8,64,2]{2,1,0} parameter(0), + sharding={devices=[1,4,1]0,1,2,3} + %rhs = f32[3,2,2]{2,1,0} parameter(1) + %convolution = f32[8,32,2]{2,1,0} convolution(%lhs, %rhs), + window={size=3 rhs_dilate=16}, dim_labels=b0f_0io->b0f + ROOT %copy = f32[8,32,2]{2,1,0} copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "convolution"), + op::Sharding("{devices=[1,4,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, TransposeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3} + %transpose = f32[11,13,7]{2,1,0} transpose(%param), dimensions={1,2,0} + ROOT %copy = f32[11,13,7]{2,1,0} copy(%transpose) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "transpose"), + op::Sharding("{devices=[1,2,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, TransposeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0) + %copy = f32[7,11,13]{2,1,0} copy(%param) + ROOT %transpose = f32[11,13,7]{2,1,0} transpose(%copy), dimensions={1,2,0}, + sharding={devices=[1,2,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, ReshapeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[1430,1]{1,0} parameter(0), + sharding={devices=[2,1]0,1} + %reshape = f32[10,11,13]{2,1,0} reshape(%param0) + ROOT %copy = f32[10,11,13]{2,1,0} copy(%reshape) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reshape"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReshapeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[2002,1]{1,0} parameter(0) + %copy = f32[2002,1]{1,0} copy(f32[2002,1]{1,0} %param0) + ROOT %reshape = f32[14,11,13]{2,1,0} reshape(%copy), + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, PadForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %pad { + %input = f32[11,17]{1,0} parameter(0), + sharding={devices=[2,2]0,1,2,3} + %pad_value = f32[] parameter(1) + %pad = f32[27,51]{1,0} pad(%input, %pad_value), padding=2_4_1x1_1_2 + ROOT %copy = f32[27,51]{1,0} copy(%pad) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "pad"), + op::Sharding("{devices=[2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ShardedPreferredOverReplicated) { + const char* const hlo_string = R"( +HloModule module +ENTRY %replicated { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={replicated} + %copy = f32[5,7,11,13]{3,2,1,0} copy(%param0) + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={devices=[1,2,2,1]0,1,2,3} + %copy.1 = f32[5,7,11,13]{3,2,1,0} copy(%param1) + %add = f32[5,7,11,13]{3,2,1,0} add(%copy, %copy.1) + ROOT %copy.2 = f32[5,7,11,13]{3,2,1,0} copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "copy.1"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, DontShardTuplesIfAllInputIsMaximal) { + const char* const hlo_string = R"( +HloModule module +ENTRY %tuple { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={maximal device=0} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={maximal device=1} + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param0, %param1) + ROOT %copy = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) copy(%tuple) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple"), op::NoSharding()); +} + +TEST_F(ShardingPropagationTest, ValidConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[13,17,19]{2,1,0} parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[19,5,19]{2,1,0} parameter(1) + %conv = f32[13,13,19]{2,1,0} convolution(%lhs, 
%rhs), + window={size=5}, dim_labels=b0f_i0o->b0f + ROOT %tuple = (f32[13,13,19]{2,1,0}) tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, StridedSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %slice { + %param = f32[17,13]{1,0} parameter(0), + sharding={devices=[2,1]0,1} + %slice = f32[7,5]{1,0} slice(%param), slice={[1:15:2], [5:10:1]} + ROOT %tuple = (f32[7,5]{1,0}) tuple(%slice) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "slice"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReduceWindowBackwardPass) { + const char* const hlo_string = R"( +HloModule module +%add (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce_window { + %param = f32[13,17]{1,0} parameter(0) + %param.copy = f32[13,17]{1,0} copy(%param) + %init = f32[] parameter(1) + ROOT %reduce-window = f32[7,17]{1,0} reduce-window(%param.copy, %init), + window={size=3x2 stride=2x1 pad=1_1x0_1}, to_apply=%add, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "param.copy"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "reduce-window"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReplicatedConvolutionLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[3,2,3]{2,1,0} parameter(0), sharding={replicated} + %rhs = f32[2,2,1]{2,1,0} parameter(1) + %conv = f32[3,2,3]{2,1,0} convolution(%lhs, %rhs), + window={size=1}, dim_labels=bf0_oi0->bf0 + ROOT %tuple = f32[3,2,3]{2,1,0} tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, ConvolutionShardedFeature) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[3,2,3]{2,1,0} parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[2,2,1]{2,1,0} parameter(1) + %conv = f32[3,2,3]{2,1,0} convolution(%lhs, %rhs), + window={size=1}, dim_labels=bf0_oi0->bf0 + ROOT %tuple = f32[3,2,3]{2,1,0} tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); +} + +TEST_F(ShardingPropagationTest, ConvolutionDifferentDimensionNumbers) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[8,16,512] parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[8,2,512] parameter(1) + 
%conv = f32[3,512,512] convolution(%lhs, %rhs), + window={size=2 stride=5}, + dim_labels=f0b_i0o->0bf + ROOT %tuple = f32[3,512,512] tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, Concatenate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %concat { + %param.0 = f32[5,7] parameter(0), + sharding={devices=[2,1]0,1} + %param.1 = f32[5,9] parameter(1), + sharding={devices=[2,1]0,1} + %concat = f32[5,16] concatenate(%param.0, %param.1), + dimensions={1} + ROOT %tuple = (f32[5,16]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "concat"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, TupleBackwardPass) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %tuple { + %param.0 = f32[1] parameter(0) + %param.1 = f32[3] parameter(1) + %copy.0 = f32[1] copy(%param.0) + %copy.1 = f32[3] copy(param.1) + ROOT %tuple = (f32[1], f32[3]) tuple(%copy.0, %copy.1), + sharding={{replicated}, {devices=[2]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy.0"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "copy.1"), + op::Sharding("{devices=[2]0,1}")); +} + +TEST_F(ShardingPropagationTest, AllReduce) { + const char* const hlo_string = R"( +HloModule module + +%add (lhs: f32[], rhs: f32[]) -> f32[] { + %add_lhs = f32[] parameter(0) + %add_rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %add_lhs, f32[] %add_rhs) +} + +ENTRY %entry { + %param.0 = f32[3] parameter(0) + %param.1 = f32[3] parameter(1) + + %copy_f_t = f32[3] copy(%param.1), sharding={devices=[2]0,1} + %crs_f.tiled = f32[3] all-reduce(%copy_f_t), to_apply=%add + %crs_f.none = f32[3] all-reduce(%copy_f_t), to_apply=%add, + channel_id=1 + + %crs_b.replicated = f32[3] all-reduce(%param.0), to_apply=%add + %copy_b_r = f32[3] copy(%crs_b.replicated), sharding={replicated} + + ROOT %tuple = (f32[3], f32[3], f32[3], f32[3]) tuple( + %crs_f.tiled, crs_f.none, %copy_b_r) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "crs_f.tiled"), + op::Sharding("{devices=[2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "crs_f.none"), op::NoSharding()); + + EXPECT_THAT(FindInstruction(module.get(), "crs_b.replicated"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, While) { + const char* const hlo_string = R"( +HloModule module + +%cond { + %vars.cond = (u32[], f32[10]{0}) parameter(0) + %count.cond = u32[] get-tuple-element((u32[], f32[10]{0}) %vars.cond), index=0 + %limit = u32[] constant(10) + ROOT %lt = pred[] compare(u32[] %count.cond, u32[] %limit), direction=LT +} + +%body { + %vars = (u32[], f32[10]{0}) parameter(0) + %count = u32[] 
get-tuple-element(%vars), index=0 + %acc = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %vars), index=1 + + %one = u32[] constant(1) + %count.1 = u32[] add(u32[] %count, u32[] %one), sharding={replicated} + %acc.1 = f32[10]{0} add(f32[10]{0} %acc, f32[10]{0} %acc) + ROOT %tuple = (u32[], f32[10]{0}) tuple(u32[] %count.1, f32[10]{0} %acc.1) +} + +ENTRY %entry { + %p0 = f32[10]{0} parameter(0) + %p0.copy = f32[10]{0} copy(f32[10]{0} %p0) + %p1 = f32[10]{0} parameter(1) + %zero = u32[] constant(0) + %init = (u32[], f32[10]{0}) tuple(u32[] %zero, f32[10]{0} %p0.copy) + %while = (u32[], f32[10]{0}) while((u32[], f32[10]{0}) %init), + body=%body, condition=%cond + %res = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %while), index=1 + %prev = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %init), index=1 + %res.1 = f32[10]{0} multiply(f32[10]{0} %res, %prev) + ROOT %res_tuple = (f32[10]{0}) tuple(f32[10]{0} %res.1) +})"; + + auto while_is_sharded = [this](HloModule* module, + const HloSharding& sharding) { + TF_ASSERT_OK_AND_ASSIGN(bool changed, ShardingPropagation().Run(module)); + EXPECT_TRUE(changed); + auto while_instr = FindInstruction(module, "while"); + EXPECT_NE(nullptr, while_instr); + std::vector instructions{ + while_instr, while_instr->while_body()->root_instruction(), + while_instr->while_body()->parameter_instruction(0), + while_instr->while_condition()->parameter_instruction(0)}; + + for (auto instr : instructions) { + EXPECT_TRUE(instr->has_sharding()); + EXPECT_EQ(sharding, instr->sharding()); + } + }; + { + // Propagation of user-defined partial sharding of while-related instruction + // (body root in this test). + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto body_root = FindInstruction(module.get(), "tuple"); + EXPECT_NE(nullptr, body_root); + auto sharding = + ParseSharding("{{replicated}, {devices=[2]0,1}}").ConsumeValueOrDie(); + body_root->set_sharding(sharding); + while_is_sharded(module.get(), sharding); + } + { + // Propagation from acc.1 to the rest of the loop. 
+ TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto acc_1 = FindInstruction(module.get(), "acc.1"); + EXPECT_NE(nullptr, acc_1); + acc_1->set_sharding(ParseSharding("{devices=[2]0,1}").ConsumeValueOrDie()); + + while_is_sharded( + module.get(), + ParseSharding("{{replicated}, {devices=[2]0,1}}").ConsumeValueOrDie()); + } +} + +TEST_F(ShardingPropagationTest, Dot) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %param.0 = f32[8,256,128] parameter(0) + %param.1 = f32[8,128,512] parameter(1) + %param.2 = f32[8,128] parameter(2) + + %p0_copy_0 = f32[8,256,128] copy(%param.0), + sharding={devices=[1,4,1]0,1,2,3} + %p1_copy_0 = f32[8,128,512] copy(%param.1), + sharding={devices=[1,2,2]0,1,2,3} + %p2_copy = f32[8,128] copy(%param.2) + %dot_prop_rhs = f32[8,256,512] dot(%p0_copy_0, %p1_copy_0), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %dot_prop_lhs = f32[8,512,256] dot(%p1_copy_0, %p0_copy_0), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={1}, rhs_contracting_dims={2} + %dot_mat_vec = f32[8,256] dot(%p0_copy_0, %p2_copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + + %p0_copy_1 = f32[8,256,128] copy(%param.0) + %p1_copy_1 = f32[8,128,512] copy(%param.1) + %dot_back_prop_rhs = f32[8,256,512] dot(%p0_copy_1, %p1_copy_1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %copy_back_prop_rhs = f32[8,256,512] copy(%dot_back_prop_rhs), + sharding={devices=[1,2,2]0,1,2,3} + + ROOT %tuple = (f32[8,256,256], f32[8,256,256], f32[8,256]) + tuple(%dot_prop_lhs, %dot_prop_rhs, %dot_mat_vec, %copy_back_prop_rhs) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dot_prop_rhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_prop_lhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_mat_vec"), + op::Sharding("{devices=[1,4]0,1,2,3}")); + + EXPECT_THAT(FindInstruction(module.get(), "p0_copy_1"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "p1_copy_1"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_back_prop_rhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, DotTiledBatchDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,256,512] parameter(0) + %p1 = f32[8,512,128] parameter(1) + + %add = f32[8,256,512] add(%p0, %p0) + %dot = f32[8,256,128] dot(%add, %p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %res = f32[8,32768] reshape(%dot), sharding={devices=[2,2]0,1,2,3} + + ROOT %tuple = (f32[8,32768]) tuple(%res) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserUnshardedDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[8,128] 
parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[8,128] copy(%p1) + + %concat = f32[16,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[1,2]0,1} + ROOT %tuple = (f32[16,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserShardedDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[8,128] parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[8,128] copy(%p1) + + %concat = f32[16,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[3,1]0,1,2} + ROOT %tuple = (f32[16,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[2,1]1,2}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserShardedDimMaximalOperand) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[24,128] parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[24,128] copy(%p1) + + %concat = f32[32,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[4,1]0,1,2,3} + ROOT %tuple = (f32[32,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), op::NoSharding()); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[3,1]1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ReplicatedToSideEffecting) { + const char* const hlo_string = R"( +HloModule module +ENTRY entry_computation { + %const.0 = s32[] constant(0), sharding={replicated} + %const.1 = s32[] constant(2147483647), sharding={replicated} + %rng = s32[4]{0} rng(%const.0, %const.1), + distribution=rng_uniform + ROOT %root = (s32[4]{0}) tuple(%rng) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rng"), op::NoSharding()); +} + +TEST_F(ShardingPropagationTest, PartReplicatedTupleUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY entry_computation { + %param.0 = f32[5] parameter(0) + %param.1 = f32[7] parameter(1) + %param.2 = f32[9] parameter(2) + %tuple.0 = (f32[5], f32[7]) tuple(%param.0, %param.1) + ROOT %tuple.1 = ((f32[5], f32[7]), f32[9]) tuple(%tuple.0, %param.2), + sharding={{maximal device=0}, {replicated}, {maximal device=1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple.0"), + op::Sharding("{{maximal device=0}, {replicated}}")); +} + 
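
The tests in this file all follow the same pattern: parse an HLO string, run ShardingPropagation, then check the sharding attached to one named instruction. A minimal sketch of a fixture helper that captures this pattern is shown below; the fixture and helper names are hypothetical and are not part of the patch.

class ShardingPropagationPatternTest : public HloTestBase {
 protected:
  // Parses `hlo`, runs the sharding propagation pass, and checks that the
  // instruction named `inst_name` ends up with `expected_sharding`.
  // (Illustrative sketch only; not part of the patch.)
  void ExpectSharding(const std::string& hlo, const std::string& inst_name,
                      const std::string& expected_sharding,
                      bool is_spmd = false) {
    TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
    TF_ASSERT_OK_AND_ASSIGN(bool changed,
                            ShardingPropagation(is_spmd).Run(module.get()));
    EXPECT_TRUE(changed);
    EXPECT_THAT(FindInstruction(module.get(), inst_name),
                op::Sharding(expected_sharding));
  }
};

With such a helper, most of the single-assertion tests above would reduce to one ExpectSharding call with the HLO text, the instruction name, and the expected sharding string.
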
+TEST_F(ShardingPropagationTest, Conditional) { + const char* const hlo_string = R"( +HloModule module + +%true_comp { + %tp = (f32[3,5]) parameter(0) + %tgte = f32[3,5] get-tuple-element(%tp), index=0 + %ttr = f32[5,3] transpose(%tgte), dimensions={1,0} + ROOT %tr = (f32[5,3]) tuple(%ttr) +} + +%false_comp { + %fp = (f32[5,3]) parameter(0) + %fgte = f32[5,3] get-tuple-element(%fp), index=0 + ROOT %fr = (f32[5,3]) tuple(%fgte) +} + +ENTRY entry { + %cond = pred[] parameter(0) + %true_param = (f32[3,5]) parameter(1), sharding={{devices=[1,2]0,1}} + %false_param = (f32[5,3]) parameter(2), sharding={{devices=[1,3]0,1,2}} + %conditional = (f32[5,3]) conditional( + %cond, %true_param, %false_param), + true_computation=%true_comp, + false_computation=%false_comp + ROOT %root = f32[5,3] get-tuple-element(%conditional), index=0 +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tp"), + op::Sharding("{{devices=[1,2]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "tgte"), + op::Sharding("{devices=[1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "ttr"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "tr"), + op::Sharding("{{devices=[2,1]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "fp"), + op::Sharding("{{devices=[1,3]0,1,2}}")); + EXPECT_THAT(FindInstruction(module.get(), "fgte"), + op::Sharding("{devices=[1,3]0,1,2}")); + EXPECT_THAT(FindInstruction(module.get(), "fr"), + op::Sharding("{{devices=[2,1]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "conditional"), + op::Sharding("{{devices=[2,1]0,1}}")); +} + +TEST_F(ShardingPropagationTest, TupleFromUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[13] parameter(0) + %p1 = f32[15] parameter(1) + %p2 = f32[17] parameter(2) + %t0 = (f32[13], f32[15]) tuple(%p0, %p1) + %t1 = ((f32[13], f32[15]), f32[17]) tuple(%t0, %p2) + %gte.0 = (f32[13], f32[15]) get-tuple-element(%t1), index=0 + %gte.1 = f32[13] get-tuple-element(%gte.0), index=0 + %gte.2 = f32[15] get-tuple-element(%gte.0), index=1 + %gte.3 = f32[17] get-tuple-element(%t1), index=1 + ROOT %t2 = (f32[13], f32[15], f32[17]) tuple(%gte.1, %gte.2, %gte.3), + sharding={{replicated}, {devices=[2]0,1}, {devices=[3]1,2,3}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "t0"), + op::Sharding("{{replicated}, {devices=[2]0,1}}")); + EXPECT_THAT( + FindInstruction(module.get(), "t1"), + op::Sharding("{{replicated}, {devices=[2]0,1}, {devices=[3]1,2,3}}")); +} + +TEST_F(ShardingPropagationTest, DynamicSliceForwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0), sharding={devices=[1,1,2]0,1} + %p1 = s32[] parameter(1) + %i0 = s32[] constant(0) + %ds = f32[11,1,15] dynamic-slice(%c0, %i0, %p1, %i0), + dynamic_slice_sizes={11,1,15} + ROOT %root = (f32[11,1,15]) tuple(%ds) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "ds"), + 
op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicSliceBackwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = s32[] parameter(1) + %i0 = s32[] constant(0) + %ds = f32[11,1,15] dynamic-slice(%c0, %i0, %p1, %i0), + dynamic_slice_sizes={11,1,15}, + sharding={devices=[1,1,2]0,1} + ROOT %root = (f32[11,1,15]) tuple(%ds) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "ds"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceForwardPassBase) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0), sharding={devices=[1,1,2]0,1} + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1) + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0) + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dus"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceForwardPassUpdate) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1), sharding={devices=[1,1,2]0,1} + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0) + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dus"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceBackwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1) + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0), + sharding={devices=[1,1,2]0,1} + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumLHSBatchPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = 
f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs) + %conv = f32[32,24,39296] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf_0oi->0bf, window={size=32 stride=31 lhs_dilate=32} + ROOT %copy = f32[32,24,39296] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputBatchPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs) + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs) + %conv = f32[32,24,39296] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf_0oi->0bf, window={size=32 stride=31 lhs_dilate=32}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumLHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64,1] parameter(1) + %rhs.copy = f32[32,39296,64,1] copy(%rhs) + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, window={size=32x1 stride=31x1 lhs_dilate=32x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputLHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs) + %rhs = f32[32,39296,64,1] parameter(1) + %rhs.copy = f32[32,39296,64,1] copy(%rhs) + ROOT %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, window={size=32x1 stride=31x1 lhs_dilate=32x1}, + sharding={devices=[1,2,1,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs.copy"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumRHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs) + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + 
dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputRHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs) + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs) + ROOT %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumChooseLargerOperand) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs), sharding={devices=[1,4,1,1]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumChooseBatchFirst) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1,1]0,1}")); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD new file mode 100644 index 00000000000..4433078472d --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -0,0 +1,71 @@ +# Description: SPMD partitioning pass. 
+ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +cc_library( + name = "spmd_partitioner", + srcs = [ + "spmd_partitioner.cc", + "spmd_partitioner_util.cc", + ], + hdrs = [ + "spmd_partitioner.h", + "spmd_partitioner_util.h", + ], + deps = [ + "//tensorflow/compiler/xla:comparison_util", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:protobuf_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/service:dot_as_convolution_util", + "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_dce", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_query", + "//tensorflow/compiler/xla/service:hlo_sharding_util", + "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/core/platform:numbers", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "spmd_partitioner_test", + srcs = ["spmd_partitioner_test.cc"], + deps = [ + ":spmd_partitioner", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc new file mode 100644 index 00000000000..635446a18a1 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -0,0 +1,4943 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/numbers.h" + +namespace xla { +namespace spmd { + +string SpmdLogger::MakeReport() { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory during transformation *****\n"); + + std::sort(entries_.begin(), entries_.end(), + [](auto const& entry0, auto const& entry1) { + return entry0.first > entry1.first; + }); + for (int64 i = 0; + i < std::min(report_instruction_count_, entries_.size()); ++i) { + absl::StrAppend( + &report, "\n ", + tensorflow::strings::HumanReadableNumBytes(entries_[i].first), " : ", + entries_[i].second, "\n"); + } + + return report; +} + +void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, + const std::vector& group) { + string report = hlo->ToString(); + int64 max_value = -1; + for (HloInstruction* inst : group) { + if (!inst->shape().IsArray()) { + continue; + } + max_value = std::max(max_value, ShapeSizeInBytes(inst->shape())); + absl::StrAppend(&report, " * ", inst->ToString(), "\n"); + } + entries_.push_back(std::make_pair(max_value, report)); +} + +/* static */ string SpmdLogger::ReportBeforePartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage before partition *****\n"); + absl::StrAppend(&report, "\n ** Replicated instructions\n"); + absl::StrAppend(&report, ReportMemoryUsage( + module, + [](const HloInstruction* hlo) { + return !hlo->has_sharding() || + hlo->sharding().IsReplicated(); + }, + report_instruction_count)); + absl::StrAppend(&report, "\n ** All instructions\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} 
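Both report helpers above follow the same pattern: collect candidate instructions, sort them by shape size in bytes, and print the largest report_instruction_count entries with human-readable byte counts. The following standalone sketch illustrates just that pattern with plain C++ containers; the Entry struct and MakeTopNReport name are illustrative stand-ins and not part of this change.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Entry {
  int64_t bytes;
  std::string description;
};

// Prints the top_n largest entries, mirroring the sort-then-truncate logic
// used by SpmdLogger::MakeReport and ReportMemoryUsage.
std::string MakeTopNReport(std::vector<Entry> entries, int64_t top_n) {
  std::sort(entries.begin(), entries.end(),
            [](const Entry& a, const Entry& b) { return a.bytes > b.bytes; });
  std::string report;
  const int64_t limit =
      std::min<int64_t>(top_n, static_cast<int64_t>(entries.size()));
  for (int64_t i = 0; i < limit; ++i) {
    report += "  " + std::to_string(entries[i].bytes) + " bytes : " +
              entries[i].description + "\n";
  }
  return report;
}

int main() {
  std::cout << MakeTopNReport({{1024, "%dot.1"}, {4096, "%convolution.2"}},
                              /*top_n=*/5);
}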
+
+/* static */ string SpmdLogger::ReportAfterPartition(
+    const HloModule& module, int64 report_instruction_count) {
+  string report;
+  absl::StrAppend(&report,
+                  "\n\n***** SPMD memory usage after partition *****\n");
+  absl::StrAppend(&report,
+                  ReportMemoryUsage(
+                      module, [](const HloInstruction* hlo) { return true; },
+                      report_instruction_count));
+  return report;
+}
+
+template <typename F>
+/* static */ string SpmdLogger::ReportMemoryUsage(
+    const HloModule& module, const F& filter, int64 report_instruction_count) {
+  string report;
+  std::vector<HloInstruction*> instructions;
+  instructions.reserve(module.instruction_count());
+
+  for (auto computation : module.computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    for (auto hlo : computation->instructions()) {
+      if (hlo->shape().IsTuple() ||
+          ShapeUtil::IsEffectiveScalar(hlo->shape())) {
+        continue;
+      }
+      if (filter(hlo)) {
+        instructions.push_back(hlo);
+      }
+    }
+  }
+
+  const auto add_report = [&](std::vector<HloInstruction*>* insts) {
+    std::sort(insts->begin(), insts->end(),
+              [](const HloInstruction* inst0, const HloInstruction* inst1) {
+                return ShapeSizeInBytes(inst0->shape()) >
+                       ShapeSizeInBytes(inst1->shape());
+              });
+    for (int64 i = 0;
+         i < std::min<int64>(report_instruction_count, insts->size()); ++i) {
+      absl::StrAppend(&report, "  ",
+                      tensorflow::strings::HumanReadableNumBytes(
+                          ShapeSizeInBytes((*insts)[i]->shape())),
+                      " : ", (*insts)[i]->ToString(), "\n");
+    }
+  };
+
+  add_report(&instructions);
+  return report;
+}
+
+namespace {
+
+// Returns the replica group configuration where each replica belongs to its own
+// group.
+std::vector<ReplicaGroup> CreateReplicaGroups(int64 num_replicas) {
+  std::vector<ReplicaGroup> groups(num_replicas);
+  for (int64 i = 0; i < num_replicas; ++i) {
+    groups[i].add_replica_ids(i);
+  }
+  return groups;
+}
+
+bool CanReshardWithAllToAll(const HloSharding& source,
+                            const HloSharding& target) {
+  return UniqueTiledDim(source) && UniqueTiledDim(target) &&
+         UniqueTiledDim(source) != UniqueTiledDim(target);
+}
+
+bool CanReshardWithCollectivePermute(const HloSharding& source,
+                                     const HloSharding& target) {
+  return UniqueTiledDim(source) && UniqueTiledDim(target) &&
+         UniqueTiledDim(source) == UniqueTiledDim(target) && source != target;
+}
+
+// Clears all sharding attributes from instructions in the module. This must be
+// called only after all SPMD transformation is complete.
+Status ClearShardingAttributes(HloModule* module) {
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* hlo : computation->instructions()) {
+      // Keep sharding annotation on Infeed and entry parameters since they're
+      // used by HloReplicationAnalysis later (for ArCrsCombiner).
+ if (hlo->opcode() == HloOpcode::kInfeed) { + continue; + } + if (hlo->opcode() == HloOpcode::kParameter && + computation == module->entry_computation()) { + continue; + } + hlo->clear_sharding(); + } + } + return Status::OK(); +} + +} // namespace + +HloInstruction* SpmdBuilder::AddInstruction( + std::unique_ptr instruction) { + HloInstruction* hlo = + HloComputation::Builder::AddInstruction(std::move(instruction)); + if (visiting_hlo_) { + instructions_[visiting_hlo_].push_back(hlo); + } + return hlo; +} + +PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } + } + cache.emplace_back(target, ReshardNoCache(target)); + state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + .reshard_cache.emplace_back(sharding(), *this); + return cache.back().second; +} + +PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { + VLOG(2) << "Resharding " << hlo_->ToString() << " from " + << hlo_->sharding().ToString() << " to " << target.ToString(); + const Shape& shape = hlo_->shape(); + CHECK(shape.IsTuple() || !target.IsTuple()); + + // Tuple shape instructions may have non-tuple sharding, which means that the + // same sharding applies to all the leaves. + if (shape.IsTuple() && !target.IsTuple()) { + return Reshard(target.GetTupleSharding(shape).ValueOrDie()); + } + + // For a tuple shape, recursively apply Reshard to all the leaves and return + // a tuple instruction. + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + auto subshape = ShapeUtil::GetTupleElementShape(shape, i); + auto element = state_.b->AddInstruction( + HloInstruction::CreateGetTupleElement(subshape, hlo(), i)); + element->set_sharding(sharding().GetSubSharding(shape, {i})); + elements.push_back( + PartitionedHlo( + element, ShapeUtil::GetTupleElementShape(base_shape_, i), state_) + .Reshard(target.GetSubSharding(shape, {i})) + .hlo()); + } + auto tuple = + state_.b->AddInstruction(HloInstruction::CreateTuple(elements)); + tuple->set_sharding(target); + return PartitionedHlo(tuple, base_shape_, state_); + } + + if (sharding() == target) { + return *this; + } + + if (shape.element_type() == TOKEN) { + return *this; + } + + if (CanReshardWithCollectivePermute(sharding(), target)) { + return ReshardWithCollectivePermute(target); + } + + if (CanReshardWithAllToAll(sharding(), target)) { + return ReshardWithAllToAll(target); + } + + // If not replicated yet, first replicate and then reshard to use one of the + // two implementations below. + if (!sharding().IsReplicated()) { + return Replicate().Reshard(target); + } + + // 'Replicated' to 'SingleDevice'. + if (target.IsTileMaximal()) { + auto copy = state_.b->AddInstruction( + HloInstruction::CreateUnary(hlo_->shape(), HloOpcode::kCopy, hlo_)); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + + // 'Replicated' to 'Tiled'. 
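+  // For example, resharding a replicated f32[9] to {devices=[4]0,1,2,3}
+  // pads the base shape to f32[12] and lets partition i dynamic-slice an
+  // f32[3] shard starting at offset 3*i (the offsets MakePartitionOffsets
+  // is expected to produce for this tiling).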
+ auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + auto shard_shape = MakePartitionedShape(shape, target); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, + MakePartitionOffsets(shape, target, state_.partition_id, state_.b), + shard_shape.dimensions())); + slice->set_sharding(target); + return PartitionedHlo(slice, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::PadWithValue( + HloInstruction* pad_value, absl::Span left_padded_dims) const { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + if (sharding.IsReplicated() || EvenlyPartitions(base_shape_, sharding)) { + return *this; + } + CHECK(!sharding.IsTileMaximal()); + auto index_shape = ShapeUtil::ChangeElementType(shape, S32); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + auto get_mask_for_dim = [&](int64 dim, HloInstruction* start_index) { + // Comparison: iota + start_index < valid_size + auto iota = + state_.b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, start_index, {})); + auto index_in_full_shape = + state_.b->AddInstruction(HloInstruction::CreateBinary( + index_shape, HloOpcode::kAdd, iota, broadcast_start_index)); + ComparisonDirection direction = ComparisonDirection::kLt; + int64 index_limit = base_shape_.dimensions(dim); + if (absl::c_linear_search(left_padded_dims, dim)) { + direction = ComparisonDirection::kGe; + index_limit = + index_shape.dimensions(dim) * sharding.tile_assignment().dim(dim) - + index_limit; + } + auto limit = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(index_limit))); + auto broadcast_limit = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, limit, {})); + return state_.b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_full_shape, broadcast_limit, direction)); + }; + + HloInstruction* mask = nullptr; + auto offsets = MakePartitionOffsets(base_shape_, sharding, + state_.partition_id, state_.b); + for (int64 i = 0; i < shape.rank(); ++i) { + if (base_shape_.dimensions(i) % sharding.tile_assignment().dim(i) == 0) { + continue; + } + if (mask == nullptr) { + mask = get_mask_for_dim(i, offsets[i]); + } else { + mask = state_.b->AddInstruction( + HloInstruction::CreateBinary(mask->shape(), HloOpcode::kAnd, mask, + get_mask_for_dim(i, offsets[i]))); + } + } + + if (mask == nullptr) { + return *this; + } + + auto broadcast_pad_value = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, pad_value, {})); + auto result = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, mask, hlo_, broadcast_pad_value)); + result->set_sharding(sharding); + return PartitionedHlo(result, base_shape_, state_); +} + +absl::optional +PartitionedHlo::ReshardAsWindowedInput(const Window& window, + const HloSharding& target, + HloInstruction* pad_value, + bool mask_invalid_region) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].window_reshard_cache; + for (auto& entry : cache) { + if (std::get<0>(entry) == target && + protobuf_util::ProtobufEquals(std::get<1>(entry), window)) { + return std::get<2>(entry); + } + } + auto update_cache = [&](WindowedInputShardReturnValue result) { + cache.emplace_back(target, window, std::move(result)); + return 
std::get<2>(cache.back()); + }; + VLOG(2) << "ReshardAsWindowedInput()\n" + << "\twindow:" << window_util::ToString(window) + << "\ttarget sharding:" << target.ToString(); + + CHECK(!target.IsTileMaximal()); + auto partition_ordinals = + MakeTiledPartitionOrdinals(target, state_.partition_id, state_.b); + auto shard_shape = base_shape_; + + std::vector start_on_padded_calculations( + base_shape_.rank()); + std::vector limit_on_padded_calculations( + base_shape_.rank()); + std::vector dynamic_slice_offset_on_output( + base_shape_.rank(), nullptr); + + Window shard_window = window; + auto padded_shape = base_shape_; + std::vector offsets_on_padded_shape(base_shape_.rank()); + std::vector per_shard_window_counts(base_shape_.rank()); + std::vector explicit_left_padding(base_shape_.rank()); + for (int64 i = 0; i < base_shape_.rank(); ++i) { + // Do not pad non-partitioned dimensions. + int64 shard_count = target.tile_assignment().dim(i); + if (shard_count == 1) { + offsets_on_padded_shape[i] = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + continue; + } + const auto& wd = window.dimensions(i); + if (wd.window_dilation() != 1) { + // TODO(yuanzx): Support window dilation. + VLOG(2) << "Failed to reshard window operand due to window dilation"; + return absl::nullopt; + } + int64 full_size = + base_shape_.dimensions(i) + + (wd.base_dilation() - 1) * (base_shape_.dimensions(i) - 1) + + wd.padding_high() + wd.padding_low(); + if (full_size < wd.size()) { + VLOG(2) << "Failed to reshard window operand because the window size is " + "larger than padded base size"; + return absl::nullopt; + } + int64 window_count = (full_size - wd.size()) / wd.stride() + 1; + per_shard_window_counts[i] = CeilOfRatio(window_count, shard_count); + if (wd.stride() != 1 && + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != 0) { + // TODO(yuanzx): Support this case. + VLOG(2) << "Failed to reshard window operand due to non-trivial dilation"; + return absl::nullopt; + } + + // We use explicit padding for full dilations, then use padding_low and + // padding_high on the sharded op for the remaining. padding_low and + // padding_high are now given initial values, which will be later updated if + // dilation is not 1. + auto swd = shard_window.mutable_dimensions(i); + explicit_left_padding[i] = wd.padding_low() / wd.base_dilation(); + swd->set_padding_low(wd.padding_low() % wd.base_dilation()); + swd->set_padding_high(0); + + // Calculation for the first element needed on the 'padded-but-not-dilated' + // shape. The start on the dilated shape could be a hole, so we add + // wd.base_dilation() - 1 to the constant term to skip the leading holes. 
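+    // In the common case of no base dilation and no low padding this reduces
+    // to x -> wd.stride() * per_shard_window_counts[i] * x, i.e. partition x
+    // starts at the offset of its first window on the padded shape.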
+ start_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + wd.base_dilation() - 1 - swd->padding_low(), wd.base_dilation()); + int64 dilated_shard_size = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + limit_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + dilated_shard_size + wd.base_dilation() - 1 - swd->padding_low(), + wd.base_dilation()); + + offsets_on_padded_shape[i] = start_on_padded_calculations[i].Calculate( + partition_ordinals[i], state_.b); + + auto shard_size_function = + limit_on_padded_calculations[i] - start_on_padded_calculations[i]; + int64 max_shard_size = shard_size_function.MaxInRange(0, shard_count); + shard_shape.set_dimensions(i, max_shard_size); + padded_shape.set_dimensions( + i, limit_on_padded_calculations[i].Calculate(shard_count - 1)); + + // For base dilation, calculate the needed padding_low and padding_high, as + // well as the offset for the output if a dynamic slice is needed after the + // sharded op. + if (wd.base_dilation() != 1) { + // Returns the offset of a shard's first valid element in the dilated + // shard. + auto get_first_valid_element_offset_on_dilated_shard = + [&](int64 shard_ordinal) { + return start_on_padded_calculations[i].Calculate(shard_ordinal) * + wd.base_dilation() + + swd->padding_low() - + wd.stride() * per_shard_window_counts[i] * shard_ordinal; + }; + CHECK_EQ(get_first_valid_element_offset_on_dilated_shard(0), + swd->padding_low()); + + // Determine swd->padding_high. + for (int64 shard_ordinal = 0; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 wanted_limit_on_dilated_shard = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + int64 actual_limit_on_dilated_shard_without_pad_high = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal) + + (max_shard_size - 1) * wd.base_dilation() + 1; + swd->set_padding_high(std::max( + swd->padding_high(), + wanted_limit_on_dilated_shard - + actual_limit_on_dilated_shard_without_pad_high)); + } + + // Determine swd->padding_low and output dynamic slice index. 
+ if (wd.stride() == 1) { + int64 max_pad_low = get_first_valid_element_offset_on_dilated_shard(0); + bool all_same = true; + for (int64 shard_ordinal = 1; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 start = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal); + if (start != swd->padding_low()) { + all_same = false; + } + max_pad_low = std::max(max_pad_low, start); + } + if (!all_same) { + auto start_on_padded_input = + start_on_padded_calculations[i].Calculate(partition_ordinals[i], + state_.b); + // We will calculate + // max_pad_low - (first_window - required_first_window) + // which equals + // required_first_window - (first_window - max_pad_low) + auto first_window_minus_max_pad_low = + MultiplyAddDivideOffsetCalculation( + wd.base_dilation(), swd->padding_low() - max_pad_low, 1) + .Calculate(start_on_padded_input, state_.b); + auto required_first_window = + MultiplyAddDivideOffsetCalculation(per_shard_window_counts[i], 0, + 1) + .Calculate(partition_ordinals[i], state_.b); + dynamic_slice_offset_on_output[i] = + state_.b->AddInstruction(HloInstruction::CreateBinary( + required_first_window->shape(), HloOpcode::kSubtract, + required_first_window, first_window_minus_max_pad_low)); + } + swd->set_padding_low(max_pad_low); + } else { + if ((wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != + 0) { + // General base dilation not yet implemented. + return absl::nullopt; + } + // padding_low on all shards should equal the initially assigned + // swd->padding_low(), i.e., the padding_low() on the original window. + } + } + } + + // Returns the output dynamic slice offset when needed, and absl::nullopt + // otherwise. + auto get_dynamic_slice_offset_on_output_if_needed = + [&]() -> absl::optional> { + if (absl::c_all_of( + dynamic_slice_offset_on_output, + [](HloInstruction* offset) { return offset == nullptr; })) { + return absl::nullopt; + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + for (int64 i = 0; i < dynamic_slice_offset_on_output.size(); ++i) { + if (dynamic_slice_offset_on_output[i] == nullptr) { + dynamic_slice_offset_on_output[i] = zero; + } + } + return dynamic_slice_offset_on_output; + }; + + // If the currrent HLO is replicated, pad then slice. + if (sharding().IsReplicated()) { + PaddingConfig padding_config; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + // Do not pad non-partitioned dimensions. + if (target.tile_assignment().dim(i) == 1) { + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + continue; + } + padding_config_dim->set_edge_padding_low(explicit_left_padding[i]); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + explicit_left_padding[i] - + base_shape_.dimensions(i)); + } + auto padded_hlo = ShapeUtil::Compatible(padded_shape, base_shape_) + ? hlo_ + : state_.b->AddInstruction(HloInstruction::CreatePad( + padded_shape, hlo_, pad_value, padding_config)); + auto sharded_input = + state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, offsets_on_padded_shape, + shard_shape.dimensions())); + return update_cache(WindowedInputShardReturnValue{ + sharded_input, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); + } + + if (target != sharding()) { + return Reshard(target).ReshardAsWindowedInput(window, target, pad_value); + } + + // Halo exchange. 
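+  // For example, a size-10 dimension split across 2 partitions (shard size 5)
+  // with a size-3, stride-1 window yields 4 windows per shard: partition 0
+  // needs one element of right halo (its windows read elements [0, 6)) and
+  // partition 1 needs one element of left halo (its windows read [4, 10)),
+  // which is what the left/right halo size functions below evaluate to.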
+ HloInstruction* visiting_hlo = hlo_; + auto original_shard_shape = MakePartitionedShape(base_shape_, target); + + std::vector left_halo_size_functions(base_shape_.rank()); + std::vector right_halo_size_functions(base_shape_.rank()); + // TODO(yuanzx): We are concatenating on each sharded dimension one at time, + // and in the second dimension (and beyond) we create halos by slicing the + // concat in the previous dimension, which is not optimal. We should generate + // halos only concating slices, instead of slicing concats. + for (int dim = 0; dim < base_shape_.rank(); ++dim) { + int64 shard_count = target.tile_assignment().dim(dim); + if (shard_count == 1) { + continue; + } + int64 input_shard_size = + CeilOfRatio(base_shape_.dimensions(dim), shard_count); + + // Left halo. The size of the halo is derived by subtracting the first read + // element offset of the i'th partition from the limit of the (i-1)'th + // partition. + MultiplyAddDivideOffsetCalculation shard_limit_of_previous_on_padded( + input_shard_size, explicit_left_padding[dim], 1); + left_halo_size_functions[dim] = + shard_limit_of_previous_on_padded - start_on_padded_calculations[dim]; + + // Right halo. + MultiplyAddDivideOffsetCalculation shard_start_of_next_on_padded( + input_shard_size, input_shard_size + explicit_left_padding[dim], 1); + right_halo_size_functions[dim] = + limit_on_padded_calculations[dim] - shard_start_of_next_on_padded; + + auto resharded = ExchangeHaloAndGetValidData( + visiting_hlo, base_shape_, left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding[dim], + padded_shape.dimensions(dim), shard_shape.dimensions(dim), dim, target, + offsets_on_padded_shape[dim], pad_value, partition_ordinals[dim], + state_.collective_ops_creator, state_.next_channel_id, state_.b, + mask_invalid_region); + if (!resharded) { + VLOG(1) << "ReshardAsWindowedInput failed without replicate first: halo " + "is beyond the neighbor."; + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + visiting_hlo = *resharded; + } + return update_cache(WindowedInputShardReturnValue{ + visiting_hlo, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); +} + +PartitionedHlo PartitionedHlo::Replicate() { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + if (sharding.IsReplicated()) { + return *this; + } + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first.IsReplicated()) { + return entry.second; + } + } + auto update_cache = [&](PartitionedHlo resharded) { + state_.reshard_cache->per_hlo_cache[resharded.hlo()] + .reshard_cache.emplace_back(sharding, *this); + cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); + return cache.back().second; + }; + // 'Single Device' to 'Repliated'. + if (sharding.IsTileMaximal()) { + return update_cache(Broadcast()); + } + + // 'Tiled' to 'Replicated'. 
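+  // Either all-gather the shards directly (when the backend provides an
+  // all-gather creator), or build the full padded shape by having every
+  // partition dynamic-update-slice its shard into a zero tensor at its own
+  // offset and then summing across partitions with an all-reduce; uneven
+  // tilings are sliced back down to the base shape at the end.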
+ HloInstruction* result = nullptr; + if (state_.collective_ops_creator.create_cross_partition_all_gather) { + result = state_.partitioner->AllGatherShards(state_.b, hlo_, sharding, + NewChannel()); + } + Shape padded_base_shape = shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + if (result == nullptr) { + auto zero = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + auto dus = + state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + padded_base_shape, zero_bcast, hlo_, + MakePartitionOffsets(padded_base_shape, sharding, + state_.partition_id, state_.b))); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto all_reduce = + state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, dus, reduction, NewChannel()); + result = all_reduce; + } + if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { + std::vector start_indices(shape.rank(), 0); + std::vector strides(shape.rank(), 1); + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + base_shape_, result, start_indices, base_shape_.dimensions(), strides)); + } + result->set_sharding(HloSharding::Replicate()); + return update_cache(PartitionedHlo(result, base_shape_, state_)); +} + +PartitionedHlo PartitionedHlo::Broadcast() const { + const Shape& shape = hlo_->shape(); + const HloSharding& sharding = hlo_->sharding(); + CHECK(sharding.HasUniqueDevice()); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + auto src_core_id = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(sharding.GetUniqueDevice()))); + Shape bcast_shape = ShapeUtil::ChangeElementType(shape, PRED); + auto is_src_core = state_.b->AddInstruction(HloInstruction::CreateBroadcast( + bcast_shape, + state_.b->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), state_.partition_id, src_core_id, + ComparisonDirection::kEq)), + {})); + + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, zero, {})); + auto operand = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, is_src_core, hlo(), zero_bcast)); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto result = state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, operand, reduction, NewChannel()); + result->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithAllToAll( + const HloSharding& target) const { + int64 partition_count = sharding().tile_assignment().num_elements(); + absl::optional input_partition_dim = UniqueTiledDim(sharding()); + absl::optional output_partition_dim = UniqueTiledDim(target); + CHECK(input_partition_dim.has_value()); + CHECK(output_partition_dim.has_value()); + + // If the device order is different in the target, fix the order with + // ReshardWithCollectivePermute. 
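+  // For example, resharding from {devices=[2,1]0,1} to {devices=[1,2]1,0}
+  // first collective-permutes to {devices=[2,1]1,0}, so the all-to-all below
+  // only has to move data between the two tiled dimensions and not between
+  // device assignments.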
+ auto input_tile_fixed_device_order = target.tile_assignment(); + input_tile_fixed_device_order.Reshape( + sharding().tile_assignment().dimensions()); + auto input_sharding_fixed_device_order = + HloSharding::Tile(input_tile_fixed_device_order); + if (input_sharding_fixed_device_order != sharding()) { + auto fixed_order = + ReshardWithCollectivePermute(input_sharding_fixed_device_order); + return fixed_order.ReshardWithAllToAll(target); + } + + auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + + // The order of ids in the group must follow the target sharding. + std::vector groups(1); + for (int64 device : target.tile_assignment()) { + groups[0].add_replica_ids(device); + } + + HloInstruction* result = nullptr; + + // Split along the split dimension (output_partition_dim) of the all-to-all + // output. + std::vector dimensions; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + if (i == *output_partition_dim) { + dimensions.push_back(partition_count); + dimensions.push_back(padded_hlo->shape().dimensions(i) / partition_count); + } else { + dimensions.push_back(padded_hlo->shape().dimensions(i)); + } + } + auto reshape = state_.b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(base_shape_.element_type(), dimensions), + padded_hlo)); + // After the reshape, it is guaranteed to have at least 3 dimensions. + auto all_to_all = + state_.collective_ops_creator.create_cross_partition_all_to_all( + state_.b, {reshape}, groups, (*state_.next_channel_id)++, + output_partition_dim); + + // Reorder the split dimension of the reshape to be located in front of the + // input partition dimension, so the two dimensions can be combined. + int64 new_input_partition_dim = (*output_partition_dim < *input_partition_dim) + ? *input_partition_dim + 1 + : *input_partition_dim; + std::vector permutation; + for (int64 i = 0; i < all_to_all->shape().rank(); ++i) { + if (i == *output_partition_dim) { + continue; + } + if (i == new_input_partition_dim) { + permutation.push_back(*output_partition_dim); + } + permutation.push_back(i); + } + auto transpose = state_.b->AddInstruction(HloInstruction::CreateTranspose( + ShapeInference::InferTransposeShape(all_to_all->shape(), permutation) + .ValueOrDie(), + all_to_all, permutation)); + + // Combine the split dimension and the input partition dimension. 
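+  // For example, with 2 partitions, resharding f32[8,16] from
+  // {devices=[2,1]0,1} (local shard f32[4,16]) to {devices=[1,2]0,1} (local
+  // shard f32[8,8]) goes reshape f32[4,16] -> f32[4,2,8], all-to-all on the
+  // size-2 dimension, transpose to f32[2,4,8], and the reshape below merges
+  // the two leading dimensions into the final f32[8,8] shard.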
+ auto new_shape = ShapeInference::InferAllToAllShape( + padded_hlo->shape(), *output_partition_dim, + *input_partition_dim, partition_count) + .ValueOrDie(); + result = state_.b->AddInstruction( + HloInstruction::CreateReshape(new_shape, transpose)); + + const Shape result_shape = MakePartitionedShape(base_shape_, target); + if (result_shape != result->shape()) { + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + result_shape, result, std::vector(result_shape.rank(), 0), + result_shape.dimensions(), std::vector(result_shape.rank(), 1))); + } + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( + const HloSharding& target) const { + CHECK(CanReshardWithCollectivePermute(sharding(), target)); + std::vector> src_dst_pairs; + sharding().tile_assignment().Each( + [&](absl::Span indices, int64 src_device) { + int64 dst_device = target.tile_assignment()(indices); + if (dst_device != src_device) { + src_dst_pairs.emplace_back(src_device, dst_device); + } + }); + auto cp = + state_.collective_ops_creator.create_cross_partition_collective_permute( + state_.b, hlo(), src_dst_pairs, (*state_.next_channel_id)++); + cp->set_sharding(target); + return PartitionedHlo(cp, base_shape_, state_); +} + +SpmdPartitioningVisitor::SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, SpmdPartitionerOptions options, + SpmdPartitioner* partitioner) + : changed_(false), + module_(computation->parent()), + num_partitions_(num_partitions), + num_replicas_(num_replicas), + collective_ops_creator_(collective_ops_creator), + next_channel_id_(next_channel_id), + b_(SpmdBuilder(computation->name() + "_spmd", /*hlo=*/nullptr)), + partition_id_(collective_ops_creator_.create_partition_id(&b_)), + logger_(logger), + options_(std::move(options)), + partitioner_(partitioner) {} + +Status SpmdPartitioningVisitor::DefaultAction(HloInstruction* hlo) { + if (hlo->HasSideEffect()) { + return Unimplemented("Side-effect ops cannot be replicated: %s", + hlo->ToString()); + } + + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + return HandleElementwise(hlo); + } + + if (!hlo->sharding().IsTileMaximal()) { + VLOG(1) << "Not partitioned in SPMD mode (DefaultAction):" + << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + VLOG(1) << " operand " << i + << " sharding:" << hlo->operand(i)->sharding().ToString(); + } + } + + // If the instruction cannot be partitioned, replicate the instruction unless + // the instruction has side-effect. 
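+  // For example, an op with sharding {devices=[2,1]0,1} but no dedicated
+  // handler is recomputed on every partition from fully replicated operands,
+  // and the replicated result is then resharded back to {devices=[2,1]0,1}
+  // so its users still see the sharding they expect.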
+ std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(HloSharding::Replicate()).hlo()); + } + auto clone = + b_.AddInstruction(hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::Replicate()); + clone->set_metadata(hlo->metadata()); + SetPartitionedHlo(hlo, + PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding())); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Preprocess(HloInstruction* hlo) { + visiting_hlo_ = hlo; + b_.set_visiting_hlo(hlo); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Postprocess(HloInstruction* hlo) { + logger_->RegisterLogEntry(GetPartitionedHlo(hlo).hlo(), + b_.derived_instructions(hlo)); + visiting_hlo_ = nullptr; + b_.set_visiting_hlo(nullptr); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleElementwise(HloInstruction* hlo) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(hlo->sharding()).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + const Shape shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + const int64 dimension = hlo->concatenate_dimension(); + if (sharding.tile_assignment().dim(dimension) == 1) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, new_operands)); + }); + return Status::OK(); + } + + // If the concatenate dimension is along one of the partitioned dimensions, + // allocate the full output shape, each partition updates its owned region, + // all-reduce across partitions, and then slice its output region. + + // We currently don't support subgroup all-reduce along partitions, so more + // than 1 partitioned dimensions is not supported. + if (sharding.tile_assignment().dim(dimension) != num_partitions_) { + return DefaultAction(hlo); + } + + // temp_output_shape is the output shape where the concatenate dimension + // is changed to the full (and padded to shard count) dimension size. + auto temp_output_shape = MakePartitionedShape(hlo->shape(), sharding); + temp_output_shape.set_dimensions( + dimension, temp_output_shape.dimensions(dimension) * + sharding.tile_assignment().dim(dimension)); + auto temp_output = CreateZero(temp_output_shape, &b_); + + // Offset of each operand along the concatenate dimension. 
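+  // For example, concatenating f32[4,6] and f32[4,10] along dimension 1 with
+  // sharding {devices=[1,2]0,1}: temp_output is a zero f32[4,16] buffer; the
+  // first operand's f32[4,3] shard is written at column 3*ordinal and the
+  // second operand's f32[4,5] shard at column 6 + 5*ordinal; the all-reduce
+  // then combines the per-partition writes before each partition slices out
+  // its own f32[4,8] piece of the result.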
+ int64 offset = 0; + for (HloInstruction* operand : hlo->operands()) { + auto spmd_operand = GetPartitionedHlo(operand).Reshard(sharding).hlo(); + std::vector start_indices( + hlo->shape().rank(), b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(S32)))); + start_indices[dimension] = + MultiplyAddDivideOffsetCalculation( + spmd_operand->shape().dimensions(dimension), offset, 1) + .Calculate(MakeTiledPartitionOrdinals(sharding, partition_id_, + &b_)[dimension], + &b_); + temp_output = b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + temp_output_shape, temp_output, spmd_operand, start_indices)); + offset += operand->shape().dimensions(dimension); + } + auto all_reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + SetPartitionedHlo(hlo, [&] { + auto start_indices = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + start_indices[dimension] = MultiplyAddDivideOffsetCalculation( + shard_shape.dimensions(dimension), 0, 1) + .Calculate(start_indices[dimension], &b_); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, all_reduce, start_indices, shard_shape.dimensions())); + }); + + return Status::OK(); +} + +// If partitioning in the operand only happens in dimensions in passthrough +// dimensions (offset dimensions in the gather output (or scatter update) that +// have the same size as the operand), returns the corresponding output (or +// update) sharding by passing through the input sharding. +absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( + const PartitionedHlo& operand, const Shape& update_or_gather_shape, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (operand.sharding().IsTileMaximal()) { + return operand.sharding(); + } + std::vector passthrough_tile(update_or_gather_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + int64 dim_partitions = operand.sharding().tile_assignment().dim(i); + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + if (dim_partitions > 1) { + return absl::nullopt; + } + collapsed++; + continue; + } + if (slice_size[i] != operand.base_shape().dimensions(i) && + dim_partitions > 1) { + return absl::nullopt; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. + return absl::nullopt; + } + passthrough_tile[offset_dim] = dim_partitions; + } + Array tile_assignment = operand.sharding().tile_assignment(); + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +// Returns whether partitioning in the operand only happens in dimensions with +// gather/scatter slice size 1. 
+bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
+    const PartitionedHlo& operand, absl::Span<const int64> index_map,
+    absl::Span<const int64> slice_size, int64 num_partitions) {
+  if (operand.sharding().IsTileMaximal()) {
+    return false;
+  }
+  int64 trivial_slice_dims_partitions = 1;
+  for (int64 dim : index_map) {
+    if (slice_size[dim] == 1) {
+      trivial_slice_dims_partitions *=
+          operand.sharding().tile_assignment().dim(dim);
+    }
+  }
+  return trivial_slice_dims_partitions == num_partitions;
+}
+
+// Returns the min and max for the indices (replicated) in a scatter/gather
+// which has the operand partitioned on trivial slice dimensions (slice size 1).
+std::pair<HloInstruction*, HloInstruction*>
+IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims(
+    const PartitionedHlo& operand, const PartitionedHlo& replicated_indices,
+    HloInstruction* partition_id, absl::Span<const int64> index_map,
+    int64 index_vector_dim, SpmdBuilder* b) {
+  auto operand_offsets = MakePartitionOffsets(
+      operand.base_shape(), operand.sharding(), partition_id, b);
+  // Find the per-dimension index bounds.
+  std::vector<HloInstruction*> min_indices;
+  std::vector<HloInstruction*> max_indices;
+  for (int64 i = 0; i < index_map.size(); ++i) {
+    int64 dim = index_map[i];
+    int64 partitions = operand.sharding().tile_assignment().dim(dim);
+    if (partitions == 1) {
+      min_indices.push_back(CreateR0WithType(
+          replicated_indices.base_shape().element_type(), 0, b));
+      max_indices.push_back(CreateR0WithType(
+          replicated_indices.base_shape().element_type(),
+          operand.base_shape().dimensions(dim), b));
+      continue;
+    }
+    auto offset = operand_offsets[dim];
+    if (offset->shape().element_type() !=
+        replicated_indices.base_shape().element_type()) {
+      offset = b->AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::MakeShape(replicated_indices.base_shape().element_type(),
+                               {}),
+          offset));
+    }
+    min_indices.push_back(offset);
+    auto partition_size_minus_1 =
+        CreateR0WithType(replicated_indices.base_shape().element_type(),
+                         operand.hlo()->shape().dimensions(dim) - 1, b);
+    max_indices.push_back(b->AddInstruction(HloInstruction::CreateBinary(
+        offset->shape(), HloOpcode::kAdd, offset, partition_size_minus_1)));
+  }
+  // Broadcast the index bounds to the same shape as the indices.
+  HloInstruction* broadcast_min;
+  HloInstruction* broadcast_max;
+  if (index_vector_dim < replicated_indices.base_shape().rank()) {
+    // The index vector is an R1, we need to reshape individual bounds to
+    // [1], and concat them if there are more than one.
+ for (int64 i = 0; i < min_indices.size(); ++i) { + min_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(min_indices[i]->shape().element_type(), {1}), + min_indices[i])); + max_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(max_indices[i]->shape().element_type(), {1}), + max_indices[i])); + } + int64 slice_dims = max_indices.size(); + if (slice_dims > 1) { + min_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(min_indices[0]->shape().element_type(), + {slice_dims}), + min_indices, 0)); + max_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + min_indices[0]->shape(), max_indices, 0)); + } + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {index_vector_dim})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {index_vector_dim})); + } else { + CHECK_EQ(max_indices.size(), 1); + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {})); + } + return {broadcast_min, broadcast_max}; +} + +Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { + auto scatter = Cast(hlo); + auto dnums = scatter->scatter_dimension_numbers(); + auto operand = GetPartitionedHlo(scatter->operand(0)); + auto indices = GetPartitionedHlo(scatter->operand(1)); + auto updates = GetPartitionedHlo(scatter->operand(2)); + std::vector slice_size(operand.base_shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = updates.base_shape().dimensions( + dnums.update_window_dims(num_update_window_dims++)); + } + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, updates.base_shape(), inserted_window_dims, + scatter_dims_to_operand_dims, update_window_dims, slice_size); + // Handle pass through cases if we can use compatible sharding for update. 
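+    // For example, if the operand is tiled only along dimensions that appear
+    // as full-size update window dimensions, the same tiling can be applied
+    // to the updates: indices are replicated, updates are resharded to the
+    // pass-through sharding, and each partition runs an ordinary scatter on
+    // its local shard with the operand sharding preserved.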
+ if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(*maybe_passthrough); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), indices.hlo(), updates.hlo(), + scatter->to_apply(), dnums, scatter->indices_are_sorted(), + scatter->unique_indices())); + pscatter->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, scatter_dims_to_operand_dims, slice_size, + num_partitions_) && + ShapeSizeInBytes(updates.base_shape()) < + ShapeSizeInBytes(scatter->shape())) { + // Operand is sharded on trivial slice dims (update slice size 1). We can + // adjust the indices on each partition by subtracting the offsets. Then + // we execute a scatter on full updated indices, and out-of-bound accesses + // will have no effect on the result as guaranteed by the scatter + // semantics. + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(HloSharding::Replicate()); + HloInstruction* indices_min; + HloInstruction* indices_max_unused; + std::tie(indices_min, indices_max_unused) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, scatter_dims_to_operand_dims, + dnums.index_vector_dim(), &b_); + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.hlo()->shape(), HloOpcode::kSubtract, indices.hlo(), + indices_min)); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), adjusted_indices, + updates.hlo(), scatter->to_apply(), dnums, + scatter->indices_are_sorted(), scatter->unique_indices())); + pscatter->set_sharding(operand.sharding()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleSlice(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)).Reshard(sharding); + + // Create a window config to represent the slice. 
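+  // For example, slicing elements [2, 10) of a size-16 dimension becomes a
+  // size-1 window with padding_low = -2 and padding_high = 10 - 16 = -6, so
+  // ReshardAsWindowedInput can treat the slice like any other windowed op
+  // (the negative padding trims the unused elements).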
+ Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(hlo->slice_strides(i)); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(-hlo->slice_starts(i)); + dim->set_padding_high(hlo->slice_limits(i) - + hlo->operand(0)->shape().dimensions(i)); + dim->set_base_dilation(1); + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + const Shape& operand_shape = reshard_operand->sharded_input->shape(); + + std::vector start_indices = hlo->slice_starts(); + std::vector limit_indices = hlo->slice_limits(); + std::vector strides = hlo->slice_strides(); + bool need_slice = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = reshard_operand->shard_window.dimensions(i); + start_indices[i] = -dim.padding_low(); + limit_indices[i] = operand_shape.dimensions(i) + dim.padding_high(); + if (start_indices[i] != 0 || strides[i] != 1 || + limit_indices[i] != operand_shape.dimensions(i)) { + need_slice = true; + } + } + + SetPartitionedHlo(hlo, [&] { + if (need_slice) { + auto shard_shape = MakePartitionedShape(hlo->shape(), sharding); + return b_.AddInstruction(HloInstruction::CreateSlice( + shard_shape, reshard_operand->sharded_input, start_indices, + limit_indices, strides)); + } + return reshard_operand->sharded_input; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSort(HloInstruction* hlo) { + HloSharding sharding = hlo->sharding(); + // Special handling for sort in TopK when first operand partitioined at + // sort dimension. + auto k = GetKValueInTopKWhenPartitionSortDim(hlo); + if (k.has_value()) { + // When the first operand partitioned at sort dimension: + // 1. Partition sort computation to different partitions; + // 2. Slice TopK value and index from different partitions; + // 3. Gather and replicate value and index from different partitions, + // the shape of replicated value and index will be + // [batch_size, ..., partition_count * k, ...]; + // 4. Final sort uses replicated value and index from different partitions + // as input. + // GetTupleElement and Slice after the non-partitoned sort won't change + // at this point, as HandleGetTupleElement and HandleSlice will update them. + HloSortInstruction* sort = DynCast(hlo); + const int64 sort_dim = sort->sort_dimension(); + auto input = hlo->operand(0); + auto index = hlo->operand(1); + const HloSharding& input_sharding = input->sharding(); + const int64 partition_count = + input_sharding.tile_assignment().dim(sort_dim); + const int64 input_size = input->shape().dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, partition_count); + const auto element_type = input->shape().element_type(); + const auto index_type = index->shape().element_type(); + + // Partition and pad input and index. + // Pad input with minimal value. + auto partitioned_input = GetPartitionedHlo(input).PadWithValue( + CreateFirstWithType(element_type, &b_)); + // Pad index with max value. 
+ auto partitioned_index = + GetPartitionedHlo(index) + .Reshard(input_sharding) + .PadWithValue(CreateLastWithType(index_type, &b_)); + + // Each partition needs to do TopK separately, thus the base shape + // becomes the padded shape. + std::vector replicated_dimensions( + input->shape().dimensions().begin(), input->shape().dimensions().end()); + replicated_dimensions[sort_dim] = per_partition_size * partition_count; + const Shape replicated_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(element_type, replicated_dimensions), + ShapeUtil::MakeShape(index_type, replicated_dimensions)}); + + // Partition original topk to different shards. + auto topk_sharding = + input_sharding.GetTupleSharding(replicated_shape).ValueOrDie(); + auto shard_shape = MakePartitionedShape(replicated_shape, topk_sharding); + auto topk = b_.AddInstruction(hlo->CloneWithNewOperands( + shard_shape, {partitioned_input.hlo(), partitioned_index.hlo()})); + + // Get value from first sort. + HloInstruction* value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(0), topk, 0)); + HloInstruction* index_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(1), topk, 1)); + + // Slice top K value from the first partitioned sort. + replicated_dimensions[sort_dim] = k.value() * partition_count; + auto slice_input = SliceFirstK(value_gte, &b_, sort_dim, k.value()); + slice_input->set_sharding(input_sharding); + PartitionedHlo partitioned_slice_input( + slice_input, ShapeUtil::MakeShape(element_type, replicated_dimensions), + MakePartitioningState()); + // Reshard value to be replicated. + auto replicated_slice_input = + partitioned_slice_input.Reshard(HloSharding::Replicate()).hlo(); + + // Slice top K index from the first parttioned sort. + auto slice_index = SliceFirstK(index_gte, &b_, sort_dim, k.value()); + slice_index->set_sharding(input_sharding); + PartitionedHlo partitioned_slice_index( + slice_index, ShapeUtil::MakeShape(index_type, replicated_dimensions), + MakePartitioningState()); + // Reshard value to be replicated. + auto replicated_slice_index = + partitioned_slice_index.Reshard(HloSharding::Replicate()).hlo(); + + // Creates replicated sort to do TopK, the input is value and index pairs + // from all the partitions. + const Shape final_topk_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(element_type, replicated_dimensions), + ShapeUtil::MakeShape(index_type, replicated_dimensions)}); + auto final_sort = b_.AddInstruction(HloInstruction::CreateSort( + final_topk_shape, sort_dim, + {replicated_slice_input, replicated_slice_index}, sort->to_apply(), + sort->is_stable())); + final_sort->set_sharding(HloSharding::Replicate() + .GetTupleSharding(final_sort->shape()) + .ValueOrDie()); + PartitionedHlo replicated_sort(final_sort, final_topk_shape, + MakePartitioningState()); + SetPartitionedHlo(hlo, replicated_sort.Reshard(hlo->sharding())); + + return Status::OK(); + } + + if (hlo->shape().IsTuple()) { + // Check that all elements are sharded in the same way. 
+ if (hlo->shape().tuple_shapes_size() == 0) { + return DefaultAction(hlo); + } + sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + if (sharding != hlo->sharding().GetSubSharding(hlo->shape(), {i})) { + return DefaultAction(hlo); + } + } + } + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 dim : hlo->dimensions()) { + if (sharding.tile_assignment().dim(dim) > 1) { + return DefaultAction(hlo); + } + } + // Reshard operands to the same as the output. + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) { + if (hlo->custom_call_target() == "SPMDFullToShardShape") { + // This op switches from auto partitioning to manual partitioning. + auto input_partitioned = GetPartitionedHlo(hlo->operand(0)); + if (!EvenlyPartitions(hlo->shape(), input_partitioned.sharding())) { + input_partitioned = input_partitioned.PadWithValue( + CreateR0WithType(hlo->shape().element_type(), 0, &b_)); + } + auto input = input_partitioned.hlo(); + CHECK(hlo->sharding().IsReplicated()); + CHECK(ShapeUtil::Compatible(input->shape(), hlo->shape())); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() == "SPMDShardToFullShape") { + // This op switches from manual partitioning to auto partitioning. + auto input = GetPartitionedHlo(hlo->operand(0)).hlo(); + CHECK(input->sharding().IsReplicated()); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + CHECK(ShapeUtil::Compatible( + copy->shape(), MakePartitionedShape(hlo->shape(), hlo->sharding()))); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() != "TopK") { + return DefaultAction(hlo); + } + + if (!hlo->operand(0)->has_sharding()) { + return DefaultAction(hlo); + } + + const HloSharding& sharding = hlo->operand(0)->sharding(); + if (sharding.IsTileMaximal() || sharding.IsReplicated()) { + return DefaultAction(hlo); + } + + const int64 sort_dim = 1; + const int64 shard_count = sharding.tile_assignment().dim(sort_dim); + + if (shard_count <= 1) { + return DefaultAction(hlo); + } + + const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim); + const int64 batch_size = hlo->shape().tuple_shapes(0).dimensions(0); + const int64 k = hlo->shape().tuple_shapes(0).dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, shard_count); + + if (k >= per_partition_size) { + return DefaultAction(hlo); + } + + auto input = hlo->operand(0); + const auto element_type = input->shape().element_type(); + + auto partitioned_input = GetPartitionedHlo(input).PadWithValue( + CreateFirstWithType(element_type, &b_)); + + // Each partition needs to do TopK separately, thus the base shape + // becomes [batch_size, k * shard_count]. 
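+  // For example, with batch_size = 2, input_size = 1000, shard_count = 4
+  // (per_partition_size = 250) and k = 10, each partition runs TopK on its
+  // f32[2,250] shard, the per-partition candidates form replicated f32[2,40]
+  // value/index tensors (indices shifted by partition_id * 250), and a final
+  // replicated sort plus slice produces the global top 10.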
+  const Shape replicated_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(hlo->operand(0)->shape().element_type(),
+                            {batch_size, k * shard_count}),
+       ShapeUtil::MakeShape(S32, {batch_size, k * shard_count})});
+  auto custom_call_sharding =
+      sharding.GetTupleSharding(replicated_shape).ValueOrDie();
+  auto shard_shape =
+      MakePartitionedShape(replicated_shape, custom_call_sharding);
+  auto topk = b_.AddInstruction(
+      hlo->CloneWithNewOperands(shard_shape, {partitioned_input.hlo()}));
+  topk->set_sharding(custom_call_sharding);
+  // Partition customcall.
+  PartitionedHlo partitioned_topk(topk, replicated_shape,
+                                  MakePartitioningState());
+  topk = partitioned_topk.hlo();
+
+  // Get value from TopK.
+  HloInstruction* value_gte =
+      b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+          topk->shape().tuple_shapes(0), topk, 0));
+  value_gte->set_sharding(sharding);
+  // Partition GetTupleElement of value.
+  PartitionedHlo value_partitioned_gte(
+      value_gte, partitioned_topk.base_shape().tuple_shapes(0),
+      MakePartitioningState());
+  // Reshard value to be replicated.
+  auto replicated_value_gte =
+      value_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+  // Get index from TopK.
+  HloInstruction* index_gte =
+      b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+          topk->shape().tuple_shapes(1), topk, 1));
+  auto partition_id_s32 = b_.AddInstruction(HloInstruction::CreateConvert(
+      ShapeUtil::MakeShape(S32, partition_id_->shape().dimensions()),
+      partition_id_));
+  // Add per-partition offset to the index; the index returned from the
+  // CustomCall always starts from 0.
+  auto index_offset = b_.AddInstruction(HloInstruction::CreateBroadcast(
+      index_gte->shape(),
+      b_.AddInstruction(HloInstruction::CreateBinary(
+          partition_id_s32->shape(), HloOpcode::kMultiply, partition_id_s32,
+          b_.AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<int32>(per_partition_size))))),
+      {}));
+  index_gte = b_.AddInstruction(HloInstruction::CreateBinary(
+      index_offset->shape(), HloOpcode::kAdd, index_gte, index_offset));
+  index_gte->set_sharding(sharding);
+  // Partition GetTupleElement of index.
+  PartitionedHlo index_partitioned_gte(
+      index_gte, partitioned_topk.base_shape().tuple_shapes(1),
+      MakePartitioningState());
+  // Reshard index to be replicated.
+  auto replicated_index_gte =
+      index_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+  // Creates a replicated sort to do TopK; the input is the value and index
+  // pairs from all the partitions. The reason to use Sort instead of the
+  // CustomCall TopK is that the CustomCall only takes the value as input, so
+  // there would be an extra Gather to get the correct index if the CustomCall
+  // were used here.
+
+  // Create comparator for the sort.
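+  // The comparator orders primarily by value (Gt) and breaks ties by index
+  // (Lt), so after the replicated sort the first k entries along sort_dim are
+  // the global top-k values together with their (offset-adjusted) indices.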
+ XlaBuilder b("Sort.Compare"); + XlaComputation comparator = CreateScalarComparisonComputation( + "compare-value-and-index", {input->shape().element_type(), S32}, {Gt, Lt}, + &b); + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape()); + HloModuleConfig config(program_shape); + TF_ASSIGN_OR_RETURN(auto new_module, + HloModule::CreateFromProto(comparator.proto(), config)); + HloCloneContext context(module_); + auto compare_computation = + module_->DeepCloneComputation(new_module->entry_computation(), &context); + auto sort = b_.AddInstruction(HloInstruction::CreateSort( + replicated_shape, sort_dim, {replicated_value_gte, replicated_index_gte}, + compare_computation, true)); + sort->set_sharding( + HloSharding::Replicate().GetTupleSharding(sort->shape()).ValueOrDie()); + PartitionedHlo replicated_sort(sort, replicated_shape, + MakePartitioningState()); + + // Slice value and index from top-k for output. + HloInstruction* sort_value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + replicated_sort.hlo()->shape().tuple_shapes(0), replicated_sort.hlo(), + 0)); + HloInstruction* sort_index_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + replicated_sort.hlo()->shape().tuple_shapes(1), replicated_sort.hlo(), + 1)); + // Slice value from final sort. + HloInstruction* slice_sort_value = + SliceFirstK(sort_value_gte, &b_, sort_dim, k); + // Slice index from final sort. + HloInstruction* slice_index_value = + SliceFirstK(sort_index_gte, &b_, sort_dim, k); + auto create_tuple = b_.AddInstruction( + HloInstruction::CreateTuple({slice_sort_value, slice_index_value})); + create_tuple->set_sharding(HloSharding::Replicate()); + + SetPartitionedHlo(hlo, PartitionedHlo(create_tuple, create_tuple->shape(), + MakePartitioningState()) + .Reshard(hlo->sharding())); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTranspose(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + std::vector inverse_dimensions(hlo->shape().rank()); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + inverse_dimensions[hlo->dimensions(i)] = i; + } + auto desired_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, inverse_dimensions); + + auto operand = GetPartitionedHlo(hlo->operand(0)) + .Reshard(desired_operand_sharding) + .hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)); + // The output shape is the source and the operand shape is the target to get + // the aligned sharding for the operand. + auto desired_operand_sharding = hlo_sharding_util::ReshapeSharding( + hlo->shape(), hlo->operand(0)->shape(), hlo->sharding()); + if (desired_operand_sharding.has_value()) { + auto operand_hlo = operand.Reshard(*desired_operand_sharding).hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand_hlo})); + }); + return Status::OK(); + } + + // Try use halo exchange for certain split-dim/merge-dims cases. 
+ // ReshapeSharding failed in these cases probably due to uneven partitioning, + // where halo exchange could help. Specifically we check the following + // conditions to detect supported cases: + // 1) Both input and output are partitioned on one dimension. + // 2) The combined size of dimensions before the partitioned dimension are the + // same on input and output. This means we don't need to consider the major + // dimensions. + // 3) Let A = the input size on the partitioned dimension, and + // B = the output size on the partitioned dimension; then + // either A % B == 0 (split dim) or B % A == 0 (merge dims). + auto maybe_input_sharded_dim = UniqueTiledDim(operand.sharding()); + auto maybe_output_sharded_dim = UniqueTiledDim(sharding); + if (!maybe_input_sharded_dim || !maybe_output_sharded_dim) { + return DefaultAction(hlo); + } + int64 input_sharded_dim = *maybe_input_sharded_dim; + int64 output_sharded_dim = *maybe_output_sharded_dim; + // Check that the major dims before the sharded dim have the same total size + // for input and output. + int64 input_major_dims_size = 1; + for (int64 i = 0; i < input_sharded_dim; ++i) { + input_major_dims_size *= operand.base_shape().dimensions(i); + } + int64 output_major_dims_size = 1; + for (int64 i = 0; i < output_sharded_dim; ++i) { + output_major_dims_size *= hlo->shape().dimensions(i); + } + if (input_major_dims_size != output_major_dims_size) { + return DefaultAction(hlo); + } + // Fix potential device ordering mismatch in tile assignment. + Array new_input_tile_assignment = sharding.tile_assignment(); + new_input_tile_assignment.Reshape( + operand.sharding().tile_assignment().dimensions()); + operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + + int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); + int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); + auto input_shard_shape = + MakePartitionedShape(operand.base_shape(), operand.sharding()); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), sharding); + if (input_dim_size % output_dim_size == 0) { + // Split dim. + int64 split_factor = input_dim_size / output_dim_size; + int64 output_shard_size = output_shard_shape.dimensions(output_sharded_dim); + // Use halo exchange to fix misaligned data. + Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == input_sharded_dim) { + dim->set_padding_high(output_shard_size * split_factor * + num_partitions_ - + input_dim_size); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, operand.sharding(), + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_operand->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_size * split_factor); + SetPartitionedHlo(hlo, [&] { + // Do a local reshape. + return b_.AddInstruction(HloInstruction::CreateReshape( + output_shard_shape, reshard_operand->sharded_input)); + }); + return Status::OK(); + } else if (output_dim_size % input_dim_size == 0) { + // Merge dims. 
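+    // For example, reshaping f32[5,4] (tiled 2 ways on dim 0, so each shard
+    // holds a padded [3,4] piece) to f32[20]: input_dim_size = 5,
+    // output_dim_size = 20, so merge_factor = 4. Each shard first reshapes its
+    // local piece to [12]; the windowed reshard below then exchanges the
+    // boundary elements (and drops the padding) so that each shard ends up
+    // with its contiguous [10] slice of the output.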
+ int64 merge_factor = output_dim_size / input_dim_size; + // First reshape locally. (The sharded dimension could include padded data.) + auto tmp_shard_shape = output_shard_shape; + tmp_shard_shape.set_dimensions( + output_sharded_dim, + input_shard_shape.dimensions(input_sharded_dim) * merge_factor); + auto tmp_reshape = b_.AddInstruction( + HloInstruction::CreateReshape(tmp_shard_shape, operand.hlo())); + tmp_reshape->set_metadata(hlo->metadata()); + tmp_reshape->set_sharding(hlo->sharding()); + auto tmp_full_shape = tmp_shard_shape; + tmp_full_shape.set_dimensions( + output_sharded_dim, + tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + auto tmp_output = + PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); + + // Use halo exchange to fix misaligned data. + Window window; + for (int64 i = 0; i < tmp_shard_shape.rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == output_sharded_dim) { + dim->set_padding_high(output_dim_size - + tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_output = tmp_output.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_output.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_output->sharded_input->shape().dimensions(output_sharded_dim), + output_shard_shape.dimensions(output_sharded_dim)); + SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleIota(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&] { + int64 dimension = Cast(hlo)->iota_dimension(); + auto iota = b_.AddInstruction(HloInstruction::CreateIota( + MakePartitionedShape(hlo->shape(), sharding), dimension)); + + if (sharding.tile_assignment().dim(dimension) > 1) { + auto partition_ordinals = + MakeTiledPartitionOrdinals(sharding, partition_id_, &b_); + auto multiplier = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(iota->shape().dimensions(dimension)))); + auto offset = b_.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, + partition_ordinals[dimension], multiplier)); + if (iota->shape().element_type() != S32) { + offset = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(iota->shape().element_type(), {}), offset)); + } + auto broadcast = b_.AddInstruction( + HloInstruction::CreateBroadcast(iota->shape(), offset, {})); + return b_.AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, broadcast)); + } + + return iota; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSingleDevice(const HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + int64 device = hlo->sharding().GetUniqueDevice(); + const HloSharding sharding = HloSharding::AssignDevice(device); + + std::vector operands; + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + 
operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + operand_shapes.push_back(operand->shape()); + } + auto operand = b_.AddInstruction(HloInstruction::CreateTuple(operands)); + auto operand_shape = ShapeUtil::MakeTupleShape(operand_shapes); + + auto on_device = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(device))); + auto pred = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), partition_id_, on_device, + ComparisonDirection::kEq)); + + SpmdBuilder true_b("true_computation", visiting_hlo_); + HloComputation* true_computation; + { + auto param = true_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "true_branch_param")); + std::vector new_operands; + for (int64 i = 0; i < operands.size(); ++i) { + new_operands.push_back(true_b.AddInstruction( + HloInstruction::CreateGetTupleElement(operand_shapes[i], param, i))); + } + auto root = true_b.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + true_computation = module_->AddEmbeddedComputation(true_b.Build(root)); + } + + SpmdBuilder false_b("false_computation", visiting_hlo_); + HloComputation* false_computation; + { + false_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "false_branch_param")); + auto root = CreateZero(hlo->shape(), &false_b); + false_computation = module_->AddEmbeddedComputation(false_b.Build(root)); + } + + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + hlo->shape(), pred, operand, true_computation, operand, + false_computation)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleAllReduce(HloInstruction* hlo) { + if (hlo->IsCrossReplicaAllReduce() && hlo->operand_count() == 1) { + return HandleElementwise(hlo); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto& operand = GetPartitionedHlo(hlo->operand(0)); + + // Tiled output. + std::vector wanted_input_tile_size(operand.base_shape().rank()); + std::vector sharded_new_dims; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + wanted_input_tile_size[i] = + hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i) && + hlo->sharding().tile_assignment().dim(i) > 1) { + sharded_new_dims.push_back(i); + } + } + if (sharded_new_dims.empty()) { + // The new dimensions are replicated, so that we can do the adjustment on + // the input. + Array wanted_input_tile_assignment(wanted_input_tile_size); + wanted_input_tile_assignment.Each( + [&](absl::Span indices, int64* val) { + std::vector indices_in_broadcast(hlo->shape().rank(), 0); + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + indices_in_broadcast[hlo->dimensions(i)] = indices[i]; + } + *val = hlo->sharding().tile_assignment()(indices_in_broadcast); + }); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) + .hlo()})); + }); + } else { + auto input = operand.Reshard(HloSharding::Replicate()).hlo(); + // We pad and shard the input first, then broadcast to the final shard + // shape. 
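+    // Concretely: the replicated input is padded so that each operand
+    // dimension becomes a multiple of the output tiling along the matching
+    // broadcast dimension, each partition then dynamic-slices the piece that
+    // maps onto its output shard, and the broadcast itself runs locally at
+    // the shard shape.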
+ auto output_offsets = + MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); + std::vector input_offsets(operand.base_shape().rank()); + auto output_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto input_shard_shape = input->shape(); + auto padded_input_shape = input->shape(); + for (int64 i = 0; i < input_offsets.size(); ++i) { + input_offsets[i] = output_offsets[hlo->dimensions(i)]; + input_shard_shape.set_dimensions( + i, output_shard_shape.dimensions(hlo->dimensions(i))); + padded_input_shape.set_dimensions( + i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * + input_shard_shape.dimensions(i)); + } + auto padded_input = PadToShape(input, padded_input_shape, &b_); + auto input_shard = + ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) + ? padded_input + : b_.AddInstruction(HloInstruction::CreateDynamicSlice( + input_shard_shape, padded_input, input_offsets, + input_shard_shape.dimensions())); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); + }); + } + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConstant(HloInstruction* hlo) { + const Literal& literal = hlo->literal(); + if (literal.shape().IsTuple() || + (!hlo->sharding().IsTileMaximal() && + (!EvenlyPartitions(hlo->shape(), hlo->sharding()) || + !literal.IsAllFirst()))) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + std::vector start_indices(hlo->shape().rank(), 0); + auto constant = b_.AddInstruction(HloInstruction::CreateConstant( + literal.Slice(start_indices, shard_shape.dimensions()))); + *constant->mutable_shape() = shard_shape; + return constant; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->dynamic_slice_sizes()[i] != hlo->shape().dimensions(i) || + !hlo->operand(i + 1)->IsConstant() || + !hlo->operand(i + 1)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. + new_indices[i] = GetPartitionedHlo(hlo->operand(i + 1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + partitioned_shape, new_input, new_indices, + partitioned_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicUpdateSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->operand(1)->shape().dimensions(i) != hlo->shape().dimensions(i) || + !hlo->operand(i + 2)->IsConstant() || + !hlo->operand(i + 2)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. 
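+      // That is, a dimension may be partitioned only if the update covers it
+      // entirely and the corresponding start index is a constant 0; anything
+      // else falls back to DefaultAction below.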
+ return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto new_update = + GetPartitionedHlo(hlo->operand(1)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. + new_indices[i] = GetPartitionedHlo(hlo->operand(i + 2)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + partitioned_shape, new_input, new_update, new_indices)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { + auto gather = Cast(hlo); + const auto& dnums = gather->gather_dimension_numbers(); + auto operand = GetPartitionedHlo(gather->operand(0)); + auto indices = GetPartitionedHlo(gather->operand(1)); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, gather->shape(), collapsed_slice_dims, start_index_map, + offset_dims, gather->gather_slice_sizes()); + if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + auto pshape = MakePartitionedShape(gather->shape(), *maybe_passthrough); + std::vector pslice_sizes(gather->gather_slice_sizes().begin(), + gather->gather_slice_sizes().end()); + for (int64 i = 0; i < pslice_sizes.size(); ++i) { + if (operand.sharding().tile_assignment().dim(i) > 1) { + pslice_sizes[i] = operand.hlo()->shape().dimensions(i); + } + } + auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + pshape, operand.hlo(), indices.hlo(), dnums, pslice_sizes, + gather->indices_are_sorted())); + pgather->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pgather, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, start_index_map, gather->gather_slice_sizes(), + num_partitions_) && + ShapeSizeInBytes(gather->shape()) < + ShapeSizeInBytes(gather->operand(0)->shape())) { + indices = indices.Reshard(HloSharding::Replicate()); + // Now the operand is partitioned in trivial slice dimensions, and the + // indices are replicated. We execute a gather on partitioned operand, + // with full number of indices, where out-of-bounds indices are clamped, + // and masked out with 0 in the result; then we use all-reduce to combine + // results. Although gather will not get faster, we avoided the need to + // replicate the operand. + HloInstruction* indices_min; + HloInstruction* indices_max; + std::tie(indices_min, indices_max) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, start_index_map, + dnums.index_vector_dim(), &b_); + // Clamp the indices. + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateTernary( + indices.base_shape(), HloOpcode::kClamp, indices_min, indices.hlo(), + indices_max)); + // Adjust the indices by subtracting the offset. 
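+      // For example, if the operand has 100 rows split evenly across 4
+      // partitions and this partition owns rows 50..74, indices_min is 50: an
+      // index of 60 becomes local index 10 after the subtraction below, while
+      // indices outside the partition's range are clamped just above, gathered
+      // anyway, and then zeroed out by the filter before the all-reduce
+      // combines the per-partition results.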
+ adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.base_shape(), HloOpcode::kSubtract, adjusted_indices, + indices_min)); + // Gather on adjusted indices. + auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + gather->shape(), operand.hlo(), adjusted_indices, dnums, + gather->gather_slice_sizes(), gather->indices_are_sorted())); + // Mask out invalid results. + auto filter = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_min, ComparisonDirection::kLt)); + filter = b_.AddInstruction(HloInstruction::CreateBinary( + filter->shape(), HloOpcode::kOr, filter, + b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_max, ComparisonDirection::kGt)))); + if (dnums.index_vector_dim() < indices.base_shape().rank()) { + std::vector reduced_filter_dims; + for (int64 i = 0; i < filter->shape().rank(); ++i) { + if (i != dnums.index_vector_dim()) { + reduced_filter_dims.push_back(filter->shape().dimensions(i)); + } + } + filter = b_.AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(PRED, reduced_filter_dims), filter, + CreateR0WithType(PRED, false, &b_), {dnums.index_vector_dim()}, + MakeBinaryAdd(PRED, module_))); + } + std::vector batch_dims; + for (int64 i = 0; i < pgather->shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.offset_dims(), i)) { + batch_dims.push_back(i); + } + } + auto broadcast_filter = b_.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType(pgather->shape(), PRED), filter, + batch_dims)); + auto filtered = b_.AddInstruction(HloInstruction::CreateTernary( + pgather->shape(), HloOpcode::kSelect, broadcast_filter, + CreateZero(pgather->shape(), &b_), pgather)); + // Combine from different partitions. + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, filtered, + MakeBinaryAdd(filtered->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleGetTupleElement(HloInstruction* hlo) { + const auto& tuple = GetPartitionedHlo(hlo->operand(0)); + auto gte = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple.hlo()->shape(), hlo->tuple_index()), + tuple.hlo(), hlo->tuple_index())); + SetPartitionedHlo(hlo, [&]() { + const auto source_sharding = tuple.sharding().GetSubSharding( + tuple.base_shape(), {hlo->tuple_index()}); + gte->set_sharding(source_sharding); + PartitionedHlo source_partitioned_gte(gte, hlo->shape(), + MakePartitioningState()); + return source_partitioned_gte.Reshard(hlo->sharding()).hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleInfeed(HloInstruction* hlo) { + const Shape& shape = ShapeUtil::GetTupleElementShape(hlo->shape(), 0); + auto token = GetPartitionedHlo(hlo->operand(0)).hlo(); + if (ShapeUtil::GetLeafCount(shape) == 0) { + // TODO(b/155819021): HloSharding has issues with tuple-shaped sharding: it + // requires one element for an empty tuple, but leaf-count number of + // elements for non-empty tuple. So if it has a nested empty tuple, we + // cannot invoke GetSubSharding() since it expects a sharding for the empty + // tuple. 
This is a workaround for that case. + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction( + HloInstruction::CreateInfeed(shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + auto sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + auto shard_shape = MakePartitionedShape(shape, sharding); + if (EvenlyPartitions(shape, sharding)) { + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateInfeed( + shard_shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + // Create a branch for each unique partitioned shape. + std::vector per_branch_partitioned_shapes; + std::vector conditional_branch_indices(num_partitions_); + for (int64 i = 0; i < num_partitions_; ++i) { + auto partitioned_shape = + MakeNonPaddedShapeForGivenPartition(shape, sharding, i); + int64 matching_existing_index = 0; + for (; matching_existing_index < per_branch_partitioned_shapes.size(); + ++matching_existing_index) { + if (ShapeUtil::Compatible( + partitioned_shape, + per_branch_partitioned_shapes[matching_existing_index])) { + break; + } + } + if (matching_existing_index < per_branch_partitioned_shapes.size()) { + conditional_branch_indices[i] = matching_existing_index; + } else { + conditional_branch_indices[i] = per_branch_partitioned_shapes.size(); + per_branch_partitioned_shapes.push_back(std::move(partitioned_shape)); + } + } + + HloInstruction* branch_index; + if (per_branch_partitioned_shapes.size() == num_partitions_) { + // Use partition ID as the branch index if each partition has its own + // branch. + branch_index = partition_id_; + // PartitionId's output is U32 but conditional requires S32. + if (branch_index->shape().element_type() != S32) { + branch_index = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::ChangeElementType(branch_index->shape(), S32), + branch_index)); + } + } else { + // Otherwise, use a constant table to look up the branch index. + auto branch_index_table = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(conditional_branch_indices))); + branch_index = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1}), branch_index_table, {partition_id_}, + {1})); + branch_index = b_.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(S32, {}), branch_index)); + } + + std::vector branches(per_branch_partitioned_shapes.size()); + for (int64 i = 0; i < branches.size(); ++i) { + SpmdBuilder branch_b(absl::StrCat("infeed_branch_", i), visiting_hlo_); + auto param = branch_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, token->shape(), "infeed_token_param")); + auto infeed = branch_b.AddInstruction(HloInstruction::CreateInfeed( + per_branch_partitioned_shapes[i], param, hlo->infeed_config())); + branches[i] = module_->AddEmbeddedComputation(branch_b.Build(infeed)); + if (!ShapeUtil::Compatible(per_branch_partitioned_shapes[i], shard_shape)) { + TF_ASSIGN_OR_RETURN( + auto padded, + branches[i]->DeepCopyInstructionWithCustomCopier( + infeed, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + // Index {1} corresponds to the token. 
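+                // Only the data leaves under tuple index {0} may need padding
+                // up to the shard shape; the token leaf (index {1}) is
+                // returned unchanged.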
+ if (leaf_index.empty() || leaf_index[0] != 0) { + return leaf; + } + ShapeIndexView subindex(leaf_index, 1); + if (ShapeUtil::Compatible( + ShapeUtil::GetSubshape(per_branch_partitioned_shapes[i], + subindex), + ShapeUtil::GetSubshape(shard_shape, subindex))) { + return leaf; + } + return PadToShape(leaf, + ShapeUtil::GetSubshape(shard_shape, subindex), + nullptr, comp); + })); + branches[i]->set_root_instruction(padded, + /*accept_different_shape=*/true); + } + } + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + ShapeUtil::MakeTupleShape({shard_shape, token->shape()}), branch_index, + branches, std::vector(branches.size(), token))); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandlePad(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + auto lhs = GetPartitionedHlo(hlo->operand(0)); + // Create a window config to represent the pad. + Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& pd = hlo->padding_config().dimensions(i); + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(pd.edge_padding_low()); + dim->set_padding_high(pd.edge_padding_high()); + dim->set_base_dilation(pd.interior_padding() + 1); + } + + auto replicated_rhs = GetPartitionedHlo(hlo->operand(1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + auto reshard_operand = + lhs.ReshardAsWindowedInput(window, hlo->sharding(), replicated_rhs, + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + PaddingConfig sharded_padding_config; + bool need_pad = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = sharded_padding_config.add_dimensions(); + const auto& wd = reshard_operand->shard_window.dimensions(i); + dim->set_edge_padding_low(wd.padding_low()); + dim->set_edge_padding_high(wd.padding_high()); + dim->set_interior_padding(wd.base_dilation() - 1); + if (wd.padding_low() != 0 || wd.padding_high() != 0 || + wd.base_dilation() != 1) { + need_pad = true; + } + } + auto sharded_pad = reshard_operand->sharded_input; + if (need_pad) { + TF_ASSIGN_OR_RETURN(auto sharded_pad_shape, + ShapeInference::InferPadShape(sharded_pad->shape(), + replicated_rhs->shape(), + sharded_padding_config)); + sharded_pad = b_.AddInstruction(hlo->CreatePad(sharded_pad_shape, + sharded_pad, replicated_rhs, + sharded_padding_config)); + } + + SetPartitionedHlo(hlo, [&]() { + if (!reshard_operand->dynamic_slice_index_on_output) { + return sharded_pad; + } + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_pad, + *reshard_operand->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleParameter(HloInstruction* hlo) { + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto new_param = b_.AddInstruction(HloInstruction::CreateParameter( + hlo->parameter_number(), shard_shape, "param")); + if (hlo->parameter_replicated_at_leaf_buffers()) { + new_param->set_parameter_replicated_at_leaf_buffers( + *hlo->parameter_replicated_at_leaf_buffers()); + } + return new_param; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { + int64 
input_count = 1; + auto per_input_sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + input_count = hlo->shape().tuple_shapes_size(); + CHECK_GT(input_count, 0); + per_input_sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + } + + std::vector inputs; + std::vector inits; + for (int64 operand_id = 0; operand_id < input_count; ++operand_id) { + inits.push_back(GetPartitionedHlo(hlo->operand(operand_id + input_count)) + .Reshard(HloSharding::Replicate()) + .hlo()); + inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); + if (operand_id > 0) { + // Make sure all operands are sharded in the same way. + inputs.back() = inputs.back().Reshard(inputs[0].sharding()); + } + if (!inputs[0].sharding().IsTileMaximal()) { + inputs.back() = inputs.back().PadWithValue(inits[operand_id]); + } + } + bool reduce_sharded_dimension = false; + if (!inputs[0].sharding().IsTileMaximal()) { + reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); + + // reduce_sharded_dimension is not supported for tuple-shaped reduces. + if (reduce_sharded_dimension && input_count > 1) { + return DefaultAction(hlo); + } + + // Currently we only support reducing all or none of the sharded + // dimensions. + if (reduce_sharded_dimension) { + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (inputs[0].sharding().tile_assignment().dim(i) > 1 && + absl::c_count(hlo->dimensions(), i) == 0) { + return DefaultAction(hlo); + } + } + } + } + + std::vector new_operand_shapes(input_count * 2); + for (int64 i = 0; i < input_count; ++i) { + new_operand_shapes[i] = inputs[i].hlo()->mutable_shape(); + new_operand_shapes[i + input_count] = inits[i]->mutable_shape(); + } + // Create the shard shape of the reduce result. + TF_ASSIGN_OR_RETURN( + auto reduce_shape, + ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), + hlo->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = hlo->shape().layout(); + + std::vector input_hlos(input_count); + for (int64 i = 0; i < input_count; ++i) { + input_hlos[i] = inputs[i].hlo(); + } + auto local_reduce = b_.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, input_hlos, inits, hlo->dimensions(), hlo->to_apply())); + local_reduce->set_metadata(hlo->metadata()); + + SetPartitionedHlo(hlo, [&]() { + HloInstruction* reduce; + if (reduce_sharded_dimension) { + CHECK(local_reduce->shape().IsArray()); + reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), NewChannel()); + reduce->set_sharding(HloSharding::Replicate()); + } else { + reduce = local_reduce; + if (inputs[0].sharding().IsTileMaximal()) { + reduce->set_sharding(inputs[0].sharding()); + } else { + // Remove tile assignment dimensions that are reduced. 
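+        // For example, an input tile assignment of shape [2,1] (dim 0 split
+        // two ways, dim 1 replicated) with a reduce over dimension 1 becomes
+        // a tile assignment of shape [2] for the output; the local reduce
+        // above already produced correctly sharded data, so only the sharding
+        // annotation changes.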
+ std::vector tile_dimensions; + for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { + if (absl::c_count(hlo->dimensions(), i) == 0) { + tile_dimensions.push_back( + inputs[0].sharding().tile_assignment().dim(i)); + } + } + Array new_tile = inputs[0].sharding().tile_assignment(); + new_tile.Reshape(tile_dimensions); + auto sharding = HloSharding::Tile(new_tile); + if (input_count > 1) { + std::vector tuple(input_count, sharding); + sharding = HloSharding::Tuple(hlo->shape(), tuple); + } + reduce->set_sharding(sharding); + } + } + + return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { + auto reverse = Cast(hlo); + if (reverse->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + auto operand = GetPartitionedHlo(reverse->operand(0)) + .Reshard(hlo_sharding_util::ReverseSharding( + reverse->sharding(), reverse->dimensions())); + auto left_padded_operand = + HaloExchangeToPadOnLeft(operand, reverse->dimensions()); + if (!left_padded_operand) { + return DefaultAction(hlo); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + left_padded_operand->shape(), {left_padded_operand})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleWhile(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + + // Shardings for the body parameter, body root, and cond parameter must be + // the same, and the condition root must be replicated so that all partitions + // follow the same control flow. + hlo->while_condition()->parameter_instruction(0)->set_sharding(sharding); + hlo->while_body()->parameter_instruction(0)->set_sharding(sharding); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_condition(), + HloSharding::Replicate(), + next_channel_id_, logger_) + .status()); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_body(), sharding, + next_channel_id_, logger_) + .status()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateWhile( + MakePartitionedShape(hlo->shape(), sharding), hlo->while_condition(), + hlo->while_body(), + GetPartitionedHlo(hlo->operand(0)).Reshard(sharding).hlo())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConditional(HloInstruction* hlo) { + std::vector branch_args; + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + + // Shardings of the branch computation parameter and its argument must be + // the same. + computation->parameter_instruction(0)->set_sharding( + hlo->operand(i + 1)->sharding()); + branch_args.push_back(GetPartitionedHlo(hlo->operand(i + 1)).hlo()); + } + + // The root of the branch computations must follow the sharding of the + // conditional instruction. + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(computation, hlo->sharding(), + next_channel_id_, logger_) + .status()); + } + + // We replicate the predicate of the conditional (the first operand) so that + // all partitions follow the same control flow. 
+ SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateConditional( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + GetPartitionedHlo(hlo->operand(0)) + .Reshard(HloSharding::Replicate()) + .hlo(), + hlo->called_computations(), branch_args)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleOutfeed(HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + return HandleSingleDevice(hlo); +} + +Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + if (hlo->sharding().IsReplicated()) { + SetPartitionedHlo(hlo, [&] { + // Run on a single device (0) and distribute the data to all other cores. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::AssignDevice(0)) + .hlo()); + } + auto clone = b_.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::AssignDevice(0)); + return PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(HloSharding::Replicate()) + .hlo(); + }); + return Status::OK(); + } + + TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); + SetPartitionedHlo(hlo, [&] { + // Replicate the operands and run partitioned Rng on all devices. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduceWindow(HloInstruction* hlo) { + auto& operand = GetPartitionedHlo(hlo->operand(0)); + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(1)) + .Reshard(HloSharding::Replicate()); + auto resharded_operand_and_window = operand.ReshardAsWindowedInput( + hlo->window(), hlo->sharding(), replicated_init.hlo()); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + + TF_ASSIGN_OR_RETURN(Shape sharded_rw_shape, + ShapeInference::InferReduceWindowShape( + resharded_operand_and_window->sharded_input->shape(), + replicated_init.hlo()->shape(), + resharded_operand_and_window->shard_window, + hlo->to_apply()->ComputeProgramShape())); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_rw_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_rw = b_.AddInstruction(HloInstruction::CreateReduceWindow( + sharded_rw_shape, resharded_operand_and_window->sharded_input, + replicated_init.hlo(), resharded_operand_and_window->shard_window, + hlo->to_apply())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_rw->shape())); + return sharded_rw; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_rw, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSelectAndScatter(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + 
auto operand = GetPartitionedHlo(hlo->operand(0));
+  auto source = GetPartitionedHlo(hlo->mutable_operand(1));
+  if (hlo->sharding() != operand.sharding()) {
+    operand = operand.Reshard(hlo->sharding());
+  }
+  if (hlo->sharding() != source.sharding()) {
+    source = source.Reshard(hlo->sharding());
+  }
+
+  // For F32 and BF16 types, we can use NaN padding to work around the issue
+  // with low/high padding, since comparison will return false with NaN input.
+  if (hlo->shape().element_type() != F32 &&
+      hlo->shape().element_type() != BF16) {
+    return DefaultAction(hlo);
+  }
+
+  auto select = hlo->called_computations()[0];
+  auto select_root = select->root_instruction();
+  if (select_root->opcode() != HloOpcode::kCompare ||
+      select_root->operand(0)->opcode() != HloOpcode::kParameter ||
+      select_root->operand(1)->opcode() != HloOpcode::kParameter ||
+      select_root->operand(0)->parameter_number() +
+              select_root->operand(1)->parameter_number() !=
+          1) {
+    return DefaultAction(hlo);
+  }
+
+  float float_pad_value;
+  if (select_root->comparison_direction() == ComparisonDirection::kGe ||
+      select_root->comparison_direction() == ComparisonDirection::kGt) {
+    if (select_root->operand(0)->parameter_number() == 0) {
+      float_pad_value = -std::numeric_limits<float>::infinity();
+    } else {
+      float_pad_value = std::numeric_limits<float>::infinity();
+    }
+  } else if (select_root->comparison_direction() == ComparisonDirection::kLe ||
+             select_root->comparison_direction() == ComparisonDirection::kLt) {
+    if (select_root->operand(0)->parameter_number() == 0) {
+      float_pad_value = std::numeric_limits<float>::infinity();
+    } else {
+      float_pad_value = -std::numeric_limits<float>::infinity();
+    }
+  } else {
+    return DefaultAction(hlo);
+  }
+
+  auto pad_value = b_.AddInstruction(HloInstruction::CreateConstant(
+      hlo->shape().element_type() == BF16
+          ? LiteralUtil::CreateR0<bfloat16>(
+                static_cast<bfloat16>(float_pad_value))
+          : LiteralUtil::CreateR0<float>(float_pad_value)));
+
+  // Replicate init
+  auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(2))
+                             .Reshard(HloSharding::Replicate());
+
+  auto partition_ordinals =
+      MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_);
+
+  // The first window for each dimension that overlaps with the shard area.
+  std::vector<MultiplyAddDivideOffsetCalculation> first_window(
+      hlo->shape().rank());
+  // The first window for each dimension that goes beyond the shard area.
+  std::vector<MultiplyAddDivideOffsetCalculation> limit_window(
+      hlo->shape().rank());
+  std::vector<OffsetCalculation> data_left_halo_sizes(hlo->shape().rank());
+  std::vector<OffsetCalculation> data_right_halo_sizes(hlo->shape().rank());
+  std::vector<OffsetCalculation> source_left_halo_sizes(hlo->shape().rank());
+  std::vector<OffsetCalculation> source_right_halo_sizes(hlo->shape().rank());
+  auto unpadded_data_shard_shape =
+      MakePartitionedShape(hlo->shape(), hlo->sharding());
+  auto unpadded_source_shard_shape =
+      MakePartitionedShape(hlo->operand(1)->shape(), hlo->sharding());
+  auto source_shard_hlo = source.hlo();
+  auto data_shard_hlo = operand.hlo();
+  for (int64 i = 0; i < hlo->shape().rank(); ++i) {
+    int64 shard_count = hlo->sharding().tile_assignment().dim(i);
+    if (shard_count == 1) {
+      continue;
+    }
+    // If stride > window_size, there will be gaps between windows. These gaps
+    // will also exist in the output, so we keep them during halo exchange.
+    //
+    // TODO(yuanzx): This could introduce overhead if partitions start at
+    // different offsets in a gap.
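+    // For example, with window size 2 and stride 3, positions 2, 5, 8, ... are
+    // not covered by any window; widening the local copy of the window to the
+    // stride below keeps those gap elements attached to their shard during the
+    // halo exchange, so the sharded output keeps the same layout as the
+    // unpartitioned result.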
+ auto wd = hlo->window().dimensions(i); + if (wd.stride() > wd.size()) { + wd.set_size(wd.stride()); + } + // shard_size * i < stride * k - pad_low + window_size => + // k > (shard_size * i + pad_low - window_size) / stride => + // first_k == (shard_size * i + pad_low - window_size + stride) / stride + first_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + wd.padding_low() - wd.size() + wd.stride(), wd.stride()); + // shard_size * (i + 1) <= stride * k - pad_low => + // k >= (shard_size * i + shard_size + pad_low) / stride => + // limit_k == (shard_size * i + shard_size + pad_low + stride - 1) / + // stride + limit_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.padding_low() + + wd.stride() - 1, + wd.stride()); + source_left_halo_sizes[i] = + MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), 0, 1) - + first_window[i]; + source_right_halo_sizes[i] = + limit_window[i] - MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), + unpadded_source_shard_shape.dimensions(i), 1); + data_left_halo_sizes[i] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), wd.padding_low(), 1)) - + OffsetCalculation( + HloOpcode::kMultiply, first_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)); + data_right_halo_sizes[i] = + OffsetCalculation( + HloOpcode::kMultiply, limit_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.stride() + + wd.padding_low() - wd.size(), + 1)); + + int64 max_windows = + (limit_window[i] - first_window[i]).MaxInRange(0, shard_count); + auto first_window_hlo = + first_window[i].Calculate(partition_ordinals[i], &b_); + // Padding on the source is filled with the init value so they do not change + // the data on overlapping windows. 
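+    // As a concrete example of the window bookkeeping above: with shard size
+    // 8, window size 3, stride 2 and low padding 1, first_window(i) =
+    // (8 * i + 1 - 3 + 2) / 2 = 4 * i and limit_window(i) =
+    // (8 * i + 8 + 1 + 2 - 1) / 2 = 4 * i + 5, so the windows owned by
+    // neighboring shards overlap by one window, which is what the source and
+    // data halo exchanges below account for.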
+ auto resharded_source = ExchangeHaloAndGetValidData( + source_shard_hlo, source.base_shape(), source_left_halo_sizes[i], + source_right_halo_sizes[i], 0, + limit_window[i].Calculate(shard_count - 1), max_windows, i, + hlo->sharding(), first_window_hlo, replicated_init.hlo(), + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_source) { + return DefaultAction(hlo); + } + source_shard_hlo = *resharded_source; + + auto offset_start_in_data = + MultiplyAddDivideOffsetCalculation(wd.stride(), 0, 1) + .Calculate(first_window_hlo, &b_); + int64 padded_data_size = + (limit_window[i].Calculate(shard_count - 1) - 1) * wd.stride() + + wd.size(); + int64 data_shard_size = (max_windows - 1) * wd.stride() + wd.size(); + auto resharded_data = ExchangeHaloAndGetValidData( + data_shard_hlo, operand.base_shape(), data_left_halo_sizes[i], + data_right_halo_sizes[i], wd.padding_low(), padded_data_size, + data_shard_size, i, hlo->sharding(), offset_start_in_data, pad_value, + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_data) { + return DefaultAction(hlo); + } + data_shard_hlo = *resharded_data; + } + + Window window_on_shard = hlo->window(); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + auto reshard_wd = window_on_shard.mutable_dimensions(i); + // The shards are already explicitly padded. + reshard_wd->set_padding_low(0); + reshard_wd->set_padding_high(0); + } + + auto sharded_select_and_scatter = + b_.AddInstruction(HloInstruction::CreateSelectAndScatter( + data_shard_hlo->shape(), data_shard_hlo, select, window_on_shard, + source_shard_hlo, replicated_init.hlo(), + hlo->called_computations()[1])); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + if (ShapeUtil::Compatible(sharded_select_and_scatter->shape(), + shard_shape)) { + return sharded_select_and_scatter; + } + auto zero = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(shard_shape.rank(), zero); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) == 1) { + continue; + } + int64 pad_low = hlo->window().dimensions(i).padding_low(); + auto left_halo_size = + data_left_halo_sizes[i].Calculate(partition_ordinals[i], &b_); + if (data_left_halo_sizes[i].Calculate(0) == pad_low) { + slice_offsets[i] = left_halo_size; + } else { + auto is_shard0 = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), zero, partition_ordinals[i], + ComparisonDirection::kEq)); + auto pad_low_hlo = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(pad_low))); + slice_offsets[i] = b_.AddInstruction(HloInstruction::CreateTernary( + zero->shape(), HloOpcode::kSelect, is_shard0, pad_low_hlo, + left_halo_size)); + } + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_select_and_scatter, slice_offsets, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTuple(HloInstruction* hlo) { + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back( + GetPartitionedHlo(hlo->operand(i)) + .Reshard(hlo->sharding().GetSubSharding(hlo->shape(), {i})) + .hlo()); + } + SetPartitionedHlo(hlo, [&]() { + return 
b_.AddInstruction(HloInstruction::CreateTuple(new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( + HloInstruction* hlo) { + TF_RET_CHECK(hlo->opcode() == HloOpcode::kConvolution); + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = hlo->convolution_dimension_numbers(); + + // Check if the operand shardings are aligned. Also we currently don't + // support partitioning non-spatial dimensions. + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + + Window window = hlo->window(); + std::vector reversed_rhs_dims; + for (int64 i = 0; i < window.dimensions_size(); ++i) { + if (window.dimensions(i).window_reversal()) { + reversed_rhs_dims.push_back(dnums.kernel_spatial_dimensions(i)); + } + } + if (!reversed_rhs_dims.empty()) { + // Make the reversed dims left-padded to prepare for window reversal. + auto left_padded_rhs = HaloExchangeToPadOnLeft(rhs, reversed_rhs_dims); + if (left_padded_rhs == nullptr) { + return DefaultAction(hlo); + } + left_padded_rhs->set_sharding(rhs.sharding()); + rhs = PartitionedHlo(left_padded_rhs, rhs.base_shape(), rhs.state()); + } + // Consider window reversal when resharding RHS or LHS. Note: this will not + // reverse the data in the shard. We use window reversal to do that. + auto aligned_rhs_sharding = hlo_sharding_util::ReverseSharding( + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices), + reversed_rhs_dims); + auto aligned_lhs_sharding = hlo_sharding_util::TransposeSharding( + hlo_sharding_util::ReverseSharding(rhs.sharding(), reversed_rhs_dims), + lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero, reversed_rhs_dims); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero, reversed_rhs_dims); + } + + // Reshard LHS by exchanging halo such that each shard computes the partial + // sum of the full shape result, and add AllReduce. + // + // The size of halo on each dimension can be calculated from the projection + // onto the LHS that each RHS shard i needs to read. 
RHS and LHS below refers + // to the shard size of RHS and LHS, WC is the number of windows, and D is the + // window dilation. + // + // * offset(i): RHS * D * i - low_padding + // * limit(i): {(RHS - 1) * D + 1} * (i + 1) + (WC - 1) * stride - low_padding + // + // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) + // * left-halo: i * LHS - offset(i) + // = (LHS - RHS) * i + low_padding + // * right-halo: limit(i) - (i + 1) * LHS + // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = + CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = + CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions(hlo->shape().rank()); + std::vector right_halo_size_functions(hlo->shape().rank()); + Window new_window = window; + + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + HloInstruction* lhs_with_halo = lhs.hlo(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + int64 rhs_shard_size_dilated = + (rhs_shard_size - 1) * wd.window_dilation() + 1; + + left_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, + 1)); + right_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size_dilated - lhs_shard_size, + rhs_shard_size_dilated - lhs_shard_size + + wd.stride() * (window_count - 1) - padding_low, + 1)); + + // Exchange halo and concatenate. + int64 dim = dnums.input_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = padding_low; + int64 shard_size_with_halo = + wd.stride() * (window_count - 1) + rhs_shard_size_dilated; + + new_window.mutable_dimensions(i)->set_padding_low(0); + new_window.mutable_dimensions(i)->set_padding_high(0); + new_window.mutable_dimensions(i)->set_size(rhs_shard_size); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). 
+ // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation()); + int64 padded_full_shape_size = 0; + auto concat = ExchangeHaloAndGetValidData( + lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), zero, + partition_ordinals[dim], collective_ops_creator_, next_channel_id_, &b_, + /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + lhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), lhs_with_halo, rhs.hlo(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, + hlo->convolution_dimension_numbers(), hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); + if (dot_dnums) { + // Use HandleDotHelper() for convs that are actually einsums. + spmd::DotGeneralDimsMapping mapping; + for (const auto& dims : dot_dnums->batch_dims) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dims.lhs; + mapping.batch_dims.back().rhs = dims.rhs; + mapping.batch_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->contracting_dims) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dims.lhs; + mapping.contracting_dims.back().rhs = dims.rhs; + mapping.contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.lhs_non_contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.rhs_non_contracting_dims.back().output = dims.output; + } + auto create_sharded_conv = + [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, + spmd::SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_conv, + dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( + *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); + return b->AddInstruction(std::move(sharded_conv)); + }; + return HandleDotHelper(hlo, mapping, create_sharded_conv); + } + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + const HloSharding& sharding = hlo->sharding(); + const auto& dnums = hlo->convolution_dimension_numbers(); + std::vector 
rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + // Handling cases where both operands' shardings are aligned. We check that + // the LHS batch dimension is not partitioned because it is mapped to the + // output feature dimension in aligned_rhs_sharding, which are not the same + // dimension. + if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { + if (options_.conv_halo_exchange_always_on_lhs) { + return HandleConvolutionTiledLhsAndRhs(hlo); + } else { + // Reshard RHS so that each shard computes the partial sum of the full + // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() + // that reshards LHS. + // + // The size of halo on each dimension can be calculated from the + // projection onto the RHS that shard i needs to read. RHS and LHS below + // refers to the shard size of RHS and LHS, WC is the number of windows, + // and D is the window dilation. + // + // * offset(i): LHS * i + low_padding - (WC - 1) * stride + // * limit(i): LHS * (i + 1) + low_padding + // + // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D) + // * left-halo: i * RHS - offset(i) + // = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding + // * right-halo: limit(i) - (i + 1) * RHS + // = (i + 1) * (LHS - RHS * D) + low_pading + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + // We currently don't support partitioning input batch or output feature + // dimensions. 
+ return lhs_sharding.tile_assignment().dim( + dnums.input_batch_dimension()) != 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = CeilOfRatio( + lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = CeilOfRatio( + rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions( + hlo->shape().rank()); + std::vector right_halo_size_functions( + hlo->shape().rank()); + Window new_window = window; + + // Data structures needed for Pad and DynamicSlice on LHS if needed. + bool need_dynamic_slice_lhs = false; + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + std::vector zero_padding(hlo->shape().rank()); + PaddingConfig pad_config = + window_util::MakeSymmetricPadding(zero_padding); + auto zero_s32 = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector dynamic_slice_start_indices( + hlo->shape().rank(), zero_s32); + Shape dynamic_slice_shape = lhs.hlo()->shape(); + Shape pad_shape = lhs.hlo()->shape(); + + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. It calculcates the halo sizes with dilation, so we apply + // CeilOfRatio({left,right}_halo_size, window_dilation). 
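+ // [Editor's note, illustrative]: CeilOfRatio is realized by folding the
+ // divisor into the constant term: e.g. with window_dilation D = 3,
+ // CeilOfRatio(halo_size, D) becomes (halo_size + D - 1) / D = (halo_size + 2) / 3,
+ // which is why the MultiplyAddDivideOffsetCalculation terms below add
+ // wd.window_dilation() - 1 before dividing by wd.window_dilation().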
+ auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = + 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + left_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + (window_count - 1) * wd.stride() - padding_low + + wd.window_dilation() - 1, + wd.window_dilation())); + right_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), + lhs_shard_size - rhs_shard_size * wd.window_dilation() + + padding_low + wd.window_dilation() - 1, + wd.window_dilation())); + + // New RHS window size includes the maximum of both left and right + // halos. + int64 halo_size = left_halo_size_functions[rhs_dimension].MaxInRange( + 1, shard_counts[i]) + + right_halo_size_functions[rhs_dimension].MaxInRange( + 0, shard_counts[i] - 1); + int64 new_window_size = + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size; + + // The amount of new low padding could be dynamic (e.g., window_dilation + // != 1), which requires pad (to the maximum) and dynamic slice on LHS. + // + // If we consider the first window, the offset of the dilated RHS that + // aligns with the first valid LHS element for shard i is 'padding_low + + // LHS * i'. When the left halo is added to RHS, the offset of the first + // RHS element is (RHS * i - left_halo) * window_dilation. The + // difference between the two values is the amount of padding_low we + // need on LHS. + auto new_padding_low_function = + OffsetCalculation( + HloOpcode::kMultiply, left_halo_size_functions[rhs_dimension], + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, wd.window_dilation(), 1))) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + -padding_low, 1)); + + int64 new_padding_low_max = + new_padding_low_function.MaxInRange(0, shard_counts[i]); + int64 new_padding_low = new_padding_low_max; + int64 new_padding_high = window_count * wd.stride() + + (new_window_size - 1) * wd.window_dilation() - + new_padding_low - lhs_shard_size; + + // We do pad/dynamic-slice only when the padding is dynamic. + if (!new_padding_low_function.IsConstant()) { + need_dynamic_slice_lhs = true; + new_padding_low = 0; + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_low(new_padding_low_max); + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_high(new_padding_low_max); + pad_shape.set_dimensions(lhs_dimension, + lhs_shard_size + 2 * new_padding_low_max); + dynamic_slice_start_indices[lhs_dimension] = + (OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, new_padding_low_max, 1)) - + new_padding_low_function) + .Calculate(partition_ordinals[lhs_dimension], &b_); + dynamic_slice_shape.set_dimensions( + lhs_dimension, lhs_shard_size + new_padding_low_max); + } + + // Since the convolution RHS operand size increased with halos, adjust + // the window config accordingly. 
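+ // [Editor's example, hypothetical sizes]: if the local RHS shard spans 4
+ // elements along this spatial dimension and halo_size is 3, the window size
+ // set below becomes 7, and the original static padding is replaced by the
+ // new_padding_low/new_padding_high derived above.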
+ new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); + new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); + new_window.mutable_dimensions(i)->set_size( + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); + } + + HloInstruction* conv_lhs = lhs.hlo(); + if (need_dynamic_slice_lhs) { + auto pad = b_.AddInstruction( + HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); + conv_lhs = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + dynamic_slice_shape, pad, dynamic_slice_start_indices, + dynamic_slice_shape.dimensions())); + } + + // Exchange halo and concatenate. + HloInstruction* rhs_with_halo = rhs.hlo(); + for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + int64 dim = dnums.kernel_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = + left_halo_size_functions[dim].Calculate(0); + int64 shard_size_with_halo = new_window.dimensions(i).size(); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - + left_halo_size_functions[dim]; + int64 padded_full_shape_size = + offset_on_padded_shape.Calculate(shard_counts[i] - 1) + + new_window.dimensions(i).size(); + auto concat = ExchangeHaloAndGetValidData( + rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), + zero, partition_ordinals[dim], collective_ops_creator_, + next_channel_id_, &b_, /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + rhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), conv_lhs, rhs_with_halo, hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums, + hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + + if (!sharding.IsTileMaximal()) { + // We don't currently support sharding on output feature dimension. + if (sharding.tile_assignment().dim(dnums.output_feature_dimension()) > 1) { + return DefaultAction(hlo); + } + + // Check if the operand and the output sharding are aligned. 
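+ // [Editor's example, hypothetical layouts]: for a 2D convolution with input
+ // layout [batch, spatial0, spatial1, feature] and output layout
+ // [batch, feature, spatial0, spatial1], input_to_output_indices is
+ // {0, 2, 3, 1}; TransposeSharding() then maps the output sharding back onto
+ // the operand's dimension order.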
+ std::vector input_to_output_indices(hlo->shape().rank()); + input_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + input_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + input_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + auto target_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, input_to_output_indices); + lhs = lhs.Reshard(target_operand_sharding); + + // Replicate the RHS. + rhs = rhs.Reshard(HloSharding::Replicate()); + + // Convolution window config does not include batch and feature dimensions, + // whereas ReshardAsWindowedInput() expects the same number of window + // dimensions as the rank of the operand. So add two more trivial + // dimensions. + std::vector ones(hlo->shape().rank(), 1); + auto operand_window = window_util::MakeWindow(ones); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = + hlo->window().dimensions(i); + } + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + auto resharded_operand_and_window = lhs.ReshardAsWindowedInput( + operand_window, target_operand_sharding, zero); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + Window new_window; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *new_window.add_dimensions() = + resharded_operand_and_window->shard_window.dimensions( + dnums.input_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + resharded_operand_and_window->sharded_input->shape(), + rhs.hlo()->shape(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums)); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_conv_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_conv = b_.AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, resharded_operand_and_window->sharded_input, + rhs.hlo(), hlo->feature_group_count(), hlo->batch_group_count(), + new_window, dnums, hlo->precision_config())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); + return sharded_conv; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_conv, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { + DotGeneralDimsMapping mapping; + const auto& dnums = hlo->dot_dimension_numbers(); + int64 next_output_dim = 0; + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); + mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); + mapping.batch_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); + mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); + 
mapping.contracting_dims.back().output = -1; + } + for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) { + continue; + } + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = i; + mapping.lhs_non_contracting_dims.back().rhs = -1; + mapping.lhs_non_contracting_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) { + continue; + } + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = -1; + mapping.rhs_non_contracting_dims.back().rhs = i; + mapping.rhs_non_contracting_dims.back().output = next_output_dim++; + } + auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, + SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_dot_shape, + ShapeInference::InferDotOpShape(l->shape(), r->shape(), + hlo->dot_dimension_numbers())); + return b->AddInstruction(HloInstruction::CreateDot( + sharded_dot_shape, l, r, hlo->dot_dimension_numbers(), + hlo->precision_config())); + }; + return HandleDotHelper(hlo, mapping, create_sharded_dot); +} + +Status SpmdPartitioningVisitor::HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { + const HloSharding& lhs_sharding = hlo->operand(0)->sharding(); + const HloSharding& rhs_sharding = hlo->operand(1)->sharding(); + + // Similar to hlo_sharding_util::TransposeSharding(), but allows + // removing/adding non-partitioned dimensions. 
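+ // [Editor's example, hypothetical shapes]: for a [b,m,k] x [b,k,n] dot with
+ // output [b,m,n], lhs_to_output_indices is {0, 1, -1} since the contracting
+ // dimension k has no output counterpart; transpose_sharding() below succeeds
+ // only if the source does not partition such a removed dimension, and it
+ // inserts a size-1 tile dimension for added dimensions (here, n).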
+ auto transpose_sharding = + [&](const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) -> absl::optional { + if (source.IsTileMaximal()) { + return source; + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + if (skipped_tgt_dims == 0) { + return tgt_sharding; + } + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return HloSharding::Tile(reshape_tiles); + }; + + std::vector lhs_to_rhs_indices(hlo->operand(0)->shape().rank(), -1); + std::vector lhs_to_output_indices(hlo->operand(0)->shape().rank(), -1); + std::vector rhs_to_lhs_indices(hlo->operand(1)->shape().rank(), -1); + std::vector rhs_to_output_indices(hlo->operand(1)->shape().rank(), -1); + std::vector output_to_lhs_indices(hlo->shape().rank(), -1); + std::vector output_to_rhs_indices(hlo->shape().rank(), -1); + auto populate_indices_mapping = + [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + if (mapping.lhs >= 0) { + lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; + lhs_to_output_indices[mapping.lhs] = mapping.output; + } + if (mapping.rhs >= 0) { + rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; + rhs_to_output_indices[mapping.rhs] = mapping.output; + } + if (mapping.output >= 0) { + output_to_lhs_indices[mapping.output] = mapping.lhs; + output_to_rhs_indices[mapping.output] = mapping.rhs; + } + }; + for (const auto& mapping : dims_mapping.batch_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + auto lhs_sharding_transposed_to_match_rhs = + transpose_sharding(lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); + auto rhs_sharding_transposed_to_match_lhs = + transpose_sharding(rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); + auto lhs_sharding_transposed_to_match_output = transpose_sharding( + lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); + auto rhs_sharding_transposed_to_match_output = transpose_sharding( + rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); + auto output_sharding_transposed_to_match_lhs = transpose_sharding( + hlo->sharding(), output_to_lhs_indices, lhs_to_output_indices); + auto output_sharding_transposed_to_match_rhs = transpose_sharding( + hlo->sharding(), output_to_rhs_indices, rhs_to_output_indices); + + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
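+ // [Editor's example, hypothetical tiling]: if the LHS tile assignment has
+ // dimensions [2, 1, 4], the batch dims map to LHS dimension 0 and the
+ // contracting dims to LHS dimension 2, then get_partitions_for_dims() below
+ // returns 2 for the batch dims and 4 for the contracting dims.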
+ auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + const int64 lhs_batch_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.batch_dims, 0); + const int64 rhs_batch_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.batch_dims, 1); + const int64 output_batch_partitions = + get_partitions_for_dims(hlo->sharding(), dims_mapping.batch_dims, 2); + const int64 lhs_contracting_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.contracting_dims, 0); + const int64 rhs_contracting_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.contracting_dims, 1); + const int64 lhs_non_contracting_partitions = get_partitions_for_dims( + lhs_sharding, dims_mapping.lhs_non_contracting_dims, 0); + const int64 rhs_non_contracting_partitions = get_partitions_for_dims( + rhs_sharding, dims_mapping.rhs_non_contracting_dims, 1); + const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.lhs_non_contracting_dims, 2); + const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.rhs_non_contracting_dims, 2); + + auto& lhs = GetPartitionedHlo(hlo->operand(0)); + auto& rhs = GetPartitionedHlo(hlo->operand(1)); + // LHS and RHS are partitioned the same way and only partitioned in batch + // dimensions. + if (lhs_batch_partitions == rhs_batch_partitions && + rhs_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_rhs == rhs_sharding) { + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + dot->set_sharding(*lhs_sharding_transposed_to_match_output); + return PartitionedHlo(dot, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // Try emit batch-partitioned einsum with one operand resharded. Returns + // whether the attempt succeeds. If may_reshard_with_allreduce is false, + // reshard must be done using all-to-all; otherwise this attempt fails. + auto try_emit_output_batch_partitioned_einsum_with_reshard = + [&](bool may_reshard_with_allreduce) -> StatusOr { + // LHS and output are batch partitioned in the same way. + if (lhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(rhs.sharding(), + *lhs_sharding_transposed_to_match_rhs)) { + return false; + } + auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + // RHS and output are batch partitioned in the same way. 
+ if (rhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(lhs.sharding(), + *rhs_sharding_transposed_to_match_lhs)) { + return false; + } + auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + return false; + }; + + { + // Try batch-parallel by resharding one operand, and not using all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(false)); + if (emitted) { + return Status::OK(); + } + } + + // Try to emit windowed DotGeneral when one operand is partitioned in the same + // way as the output along non-contracting dimensions, but the other operand + // is tiled in other dimensions. + auto emit_windowed_dot_general = [&](int64 matching_operand, + int64 windowing_operand, + bool windowed_at_contracting_dims, + bool windowed_at_batch_dims) { + CHECK_EQ(matching_operand + windowing_operand, 1); + CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); + auto unpadded_result_buffer_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto padded_result_buffer_shape = unpadded_result_buffer_shape; + // For windowing at batch/non-contracting dims, we produce the result one + // partition at a time, so we need to pad the shape in case of uneven + // partitioning in order to make dynamic-update-slice in-bound. + if (!windowed_at_contracting_dims) { + padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( + padded_result_buffer_shape, + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output); + } + // Mask the padding area of the windowed operand with zero if there is + // uneven partitioning. + if (windowed_at_contracting_dims) { + auto& to_mask = windowing_operand == 0 ? lhs : rhs; + to_mask = + to_mask.PadWithValue(b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type())))); + } + auto result_buffer = CreateZero(padded_result_buffer_shape, &b_); + auto iteration = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + // Create a while loop that computes one window per iteration. During each + // iteration, each partition sends its input window to its neighbor using + // collective-permute for the next iteration. 
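+ // [Editor's walkthrough, hypothetical partition count]: with
+ // num_partitions_ = 4, partition p works on the chunk of partition
+ // (p + i) mod 4 at iteration i (see data_partition_id below); e.g.
+ // partition 1 sees chunks 1, 2, 3, 0 over the four iterations, receiving
+ // each next chunk from partition 2 via collective-permute.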
+ SpmdBuilder body_b("windowed_dot_general_body", visiting_hlo_); + auto param = body_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto l = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); + auto r = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); + auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), param, 2)); + auto i = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); + + auto partition_id = collective_ops_creator_.create_partition_id(&body_b); + auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, partition_id)); + auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))); + data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); + auto dot_lhs = l; + auto dot_rhs = r; + if (windowed_at_contracting_dims || windowed_at_batch_dims) { + // Slice the matching operand according to the partitioned contracting + // dimensions on the windowed operand. We do this by treating the matching + // operand as replicated, and resharding it to match the windowed operand. + auto slice_operand = matching_operand == 0 ? l : r; + slice_operand->set_sharding(HloSharding::Replicate()); + auto state = MakePartitioningState(); + state.b = &body_b; + state.partition_id = data_partition_id; + auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) + .Reshard(windowing_operand == 0 + ? *lhs_sharding_transposed_to_match_rhs + : *rhs_sharding_transposed_to_match_lhs) + .hlo(); + slice_operand->clear_sharding(); + if (matching_operand == 0) { + dot_lhs = slice; + } else { + dot_rhs = slice; + } + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + if (windowed_at_contracting_dims) { + // Accumulate the partial output to the result buffer. + o = body_b.AddInstruction( + HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); + } else { + // The windowing operand is partitioned along batch/non-contracting + // dimensions, so we need a dynamic-update-slice to save the partial + // output in the result buffer. + auto offsets = MakePartitionOffsets( + o->shape(), + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output, + data_partition_id, &body_b); + o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + o->shape(), o, dot, offsets)); + } + + // ++i + i = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, + body_b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); + auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), i, + body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + // Collective-permute for the next window. We don't need it for the last + // iteration, so we use a conditional around the collective-permute. 
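+ // [Editor's example, hypothetical partition count]: with 4 partitions the
+ // source-destination pairs built below are {0->3, 1->0, 2->1, 3->2}, and on
+ // the last iteration the false branch simply forwards the window unchanged.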
+ HloInstruction* conditional; + { + SpmdBuilder cp_b("window_collective_permute", visiting_hlo_); + { + auto p = cp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + std::vector> sd_pairs(num_partitions_); + for (int64 source = 0; source < num_partitions_; ++source) { + // 0 -> n-1, 1 -> 0, 2 -> 1, ... + sd_pairs[source] = {source, + (source - 1 + num_partitions_) % num_partitions_}; + } + collective_ops_creator_.create_cross_partition_collective_permute( + &cp_b, p, sd_pairs, (*next_channel_id_)++); + } + SpmdBuilder ncp_b("last_iteration_noop", visiting_hlo_); + { + ncp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + } + conditional = body_b.AddInstruction(HloInstruction::CreateConditional( + windowing_operand == 0 ? l->shape() : r->shape(), has_more, + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(cp_b.Build()), + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(ncp_b.Build()))); + } + if (windowing_operand == 0) { + l = conditional; + } else { + r = conditional; + } + body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i})); + + SpmdBuilder cond_b("windowed_dot_general_cond", visiting_hlo_); + auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement( + iteration->shape(), cond_param, 3)); + cond_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), cond_i, + cond_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + auto while_loop = b_.AddInstruction(HloInstruction::CreateWhile( + cond_param->shape(), module_->AddEmbeddedComputation(cond_b.Build()), + module_->AddEmbeddedComputation(body_b.Build()), + b_.AddInstruction(HloInstruction::CreateTuple( + {lhs.hlo(), rhs.hlo(), result_buffer, iteration})))); + windowed_dot_general_loops_.push_back({while_loop, windowing_operand, + windowed_at_contracting_dims, + windowed_at_batch_dims}); + SetPartitionedHlo(hlo, [&] { + auto result = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), while_loop, 2)); + if (!ShapeUtil::Compatible(padded_result_buffer_shape, + unpadded_result_buffer_shape)) { + result = b_.AddInstruction(HloInstruction::CreateSlice( + unpadded_result_buffer_shape, result, + std::vector(padded_result_buffer_shape.rank(), 0), + unpadded_result_buffer_shape.dimensions(), + std::vector(padded_result_buffer_shape.rank(), 1))); + } + return result; + }); + return Status::OK(); + }; + if (output_lhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_lhs == lhs_sharding && + ShapeSizeInBytes(hlo->operand(1)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (rhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, true, false); + } + if (rhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, false); + } + if (rhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, true); + } + } + if (output_rhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_rhs == rhs_sharding && 
+ ShapeSizeInBytes(hlo->operand(0)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (lhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, true, false); + } + if (lhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, false); + } + if (lhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, true); + } + } + + { + // Try batch-parallel by resharding one operand, and allowing all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(true)); + if (emitted) { + return Status::OK(); + } + } + + // LHS and RHS have the same partitioned contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions == num_partitions_) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + // Pad both sides with zero, since NaN at one side cannot be masked by zero + // on the other side. + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // LHS and output have the same partitioned non-contracting dimensions. + if (lhs_non_contracting_partitions == num_partitions_ && + output_lhs_non_contracting_partitions == num_partitions_ && + lhs_sharding == hlo->sharding()) { + auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs_replicated, &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // RHS and output have the same partitioned non-contracting dimensions. + if (rhs_non_contracting_partitions == num_partitions_ && + output_rhs_non_contracting_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs_replicated, rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Output is batch partitioned. + if (output_batch_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along LHS non-contracting dimensions. 
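+ // [Editor's example, hypothetical shapes]: for an [m, k] x [k, n] matmul
+ // whose output is tiled along m, the LHS is resharded to match that
+ // m-partitioning and the RHS is replicated, so each partition computes its
+ // output tile locally without a cross-partition reduction.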
+ if (output_lhs_non_contracting_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions_) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Returns true if it is beneficial to reshard the operand at `operand_idx` + // across the contracting dimension. + const auto should_partition_contracting_dim = [&](int64 operand_idx) { + if (!hlo->sharding().IsReplicated()) { + return false; + } + + if (operand_idx == 0) { + // If LHS and output are replicated, we compare the cost of all-gather + // on RHS vs all-reduce on the output. + return (rhs_contracting_partitions == num_partitions_) && + lhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(1)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } else { + return (lhs_contracting_partitions == num_partitions_) && + rhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(0)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } + }; + + // When the output is replicated and one of the operands is partitioned along + // contracting dimension, align the other operand to be partitioned along + // the contracting dimensions. + if (hlo->sharding().IsReplicated() && (should_partition_contracting_dim(0) || + should_partition_contracting_dim(1))) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (should_partition_contracting_dim(0)) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()).hlo(); + }); + return Status::OK(); + } + + return DefaultAction(hlo); +} + +namespace { + +// Finds a cluster of nodes that produce the inputs for `hlo` which only depend +// on small operands, which means the cluster should start with broadcasts, +// constants and iotas. All other internal nodes must be non-side-effecting +// elemntwise ops. Returns the set of nodes, and the small operands. E.g., for +// the following graph, +// +// a -> broadcast -> multiply +// iota ---> add--/ +// constant/ +// +// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return +// <{broadcast, iota, constant, add, multiply}, [a]>. 
+std::pair, std::vector> +FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { + std::unordered_set nodes_found; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector worklist; + worklist.push_back(hlo); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (nodes_found.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast || + inst->opcode() == HloOpcode::kConstant || + inst->opcode() == HloOpcode::kIota) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + auto res = new_operands_set.emplace(o); + if (res.second) { + new_operands.push_back(o); + } + } + } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + worklist.push_back(o); + } + } else { + nodes_found.clear(); + new_operands.clear(); + break; + } + } + return {std::move(nodes_found), std::move(new_operands)}; +} + +// Moves a cluster of memory-reducing nodes into the windowed dot-general loop +// on contracting dimensions. Such a loop has a dynamic slice on the +// non-windowed operand. If we move the input nodes into the loop, the +// dynamic-slice could be merged with them by later optimization passes, which +// reduces memory. +// +// small_operands small_operands +// | | +// input_nodes loop { | +// | => input_nodes +// loop { | | +// dynamic-slice dynamic-slice +// ... ... +// } } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes. +Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + HloInstruction* loop, int64 non_windowed_operand_index) { + auto input_tuple = loop->mutable_operand(0); + auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); + auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); + auto to_sink = std::move(input_nodes.first); + auto new_operands = std::move(input_nodes.second); + if (to_sink.empty()) { + return Status::OK(); + } + auto computation = loop->parent(); + // Replace the old operand with a tuple of the found small operands. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_input_subtuple)); + + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto old_body_param_users = body_param->users(); + // Update all tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body->root_instruction()}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), + {non_windowed_operand_index}) = + new_input_subtuple->shape(); + } + // Now update the loop body. + auto new_operand_tuple_inside = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, non_windowed_operand_index)); + TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_operand_tuple_inside)); + + // Create nodes inside the loop body. 
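+ // [Editor's sketch of the cloning order]: outside_to_inside is seeded with
+ // get-tuple-elements for the small operands; nullary ops (constants, iotas)
+ // go onto the worklist first, and any other node, e.g. add(broadcast(x), iota),
+ // is queued only once all of its operands already have counterparts inside
+ // the loop, so the cluster is rebuilt in dependency order.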
+ std::vector worklist; + std::unordered_map outside_to_inside; + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_operand_tuple_inside, i)); + add_users_if_available(new_operands[i]); + } + // HLOs to sink without operands. + std::vector nullaries_to_sink; + for (auto inst : to_sink) { + if (inst->operand_count() == 0) { + nullaries_to_sink.push_back(inst); + } + } + // Sort nullaries_to_sink to make it deterministic. + absl::c_sort(nullaries_to_sink, + [](const HloInstruction* a, const HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + for (auto inst : nullaries_to_sink) { + worklist.push_back(inst); + } + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + std::vector inst_new_operands(inst->operand_count()); + for (int64 i = 0; i < inst->operand_count(); ++i) { + inst_new_operands[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction( + inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); + add_users_if_available(inst); + } + TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); + for (auto ou : old_body_param_users) { + if (ou->opcode() == HloOpcode::kGetTupleElement && + ou->tuple_index() == non_windowed_operand_index) { + TF_RETURN_IF_ERROR( + ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); + TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); + } + } + return Status::OK(); +} + +// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into +// the windowed dot-general loop on non-contracting dimensions. Such a loop has +// a dynamic-update-slice at the output. If we move the user nodes into the loop +// and before the dynamic-update-slice, the user nodes can operate on smaller +// shapes, which reduces memory. +// +// small_operands small_operands +// | | => | | +// | | loop { loop { | | +// | | conv | broadcast conv +// | | | | | / +// | | dynamic-update-slice | dynamic-slice / +// | | | | | / +// | | } | | multiply----- +// |broadcast / | / +// | | / reduce +// |multiply-- | +// \ | dynamic-update-slice +// reduce } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes (broadcast). +Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + HloInstruction* loop) { + CHECK_EQ(loop->user_count(), 1); + // There should be a single direct user of the while loop, which is the + // gte for element 2, i.e., the dot output. + auto user_gte = loop->users().front(); + CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); + CHECK_EQ(user_gte->tuple_index(), 2); + auto computation = loop->parent(); + + // Find the reduce outputs and the input nodes they depend on, if input nodes + // only have small operands. 
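+ // [Editor's example, hypothetical graph]: for a user chain
+ // loop_output -> multiply(_, broadcast(scalar)) -> reduce(add, init=0),
+ // the traversal below collects {multiply, broadcast, reduce}, records the
+ // scalar and the reduce init value as small operands, and appends the
+ // reduce to reduce_outputs.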
+ std::unordered_set to_move; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector reduce_outputs; + std::vector worklist; + Shape padded_shape = user_gte->shape(); + Shape unpadded_shape = user_gte->shape(); + auto original_output = user_gte; + + if (user_gte->user_count() == 1 && + user_gte->users().back()->opcode() == HloOpcode::kSlice) { + original_output = user_gte->users().back(); + unpadded_shape = original_output->shape(); + } + for (auto u : original_output->users()) { + worklist.push_back(u); + } + to_move.insert(original_output); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (to_move.count(inst) > 0) { + continue; + } + // We only support reduces with simple reduction function, since we may need + // to accumulate across iterations manually. + if (inst->opcode() == HloOpcode::kReduce && + inst->to_apply()->instruction_count() == 3 && + inst->to_apply()->num_parameters() == 2 && + inst->to_apply()->root_instruction()->IsElementwise()) { + to_move.insert(inst); + auto other_operand = inst->mutable_operand(1); + auto res = new_operands_set.emplace(other_operand); + if (res.second) { + new_operands.push_back(other_operand); + } + reduce_outputs.push_back(inst); + } else if (inst != computation->root_instruction() && + inst->user_count() > 0 && inst->IsElementwise() && + !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + // For an elementwise op, we need to make sure that they depend on only + // nodes already in to_move and nodes with small operands. + bool can_include = true; + for (auto operand : inst->operands()) { + if (to_move.count(operand) > 0) { + continue; + } + auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand); + if (find_result.first.empty()) { + can_include = false; + break; + } + for (auto n : find_result.first) { + to_move.insert(n); + } + for (auto new_operand : find_result.second) { + auto res = new_operands_set.insert(new_operand); + if (res.second) { + new_operands.push_back(new_operand); + } + } + } + if (!can_include) { + to_move.clear(); + break; + } + to_move.insert(inst); + for (auto u : inst->users()) { + worklist.push_back(u); + } + } else { + to_move.clear(); + break; + } + } + // If nothing is found, to_move could contain only original_output, or cleared + // by the above code. + if (to_move.size() <= 1) { + return Status::OK(); + } + + // We will replace the original loop output with reduce-shape outputs. Create + // the initial buffers before the loop. + for (auto out : reduce_outputs) { + auto padded_out_shape = out->shape(); + int64 operand_dim = 0; + int64 output_dim = 0; + while (output_dim < padded_out_shape.rank()) { + if (absl::c_linear_search(out->dimensions(), operand_dim)) { + // Dimension colapsed. + ++operand_dim; + continue; + } + // Kept dimensions have the same size of the padded shape. 
+ padded_out_shape.set_dimensions(output_dim, + padded_shape.dimensions(operand_dim)); + ++operand_dim; + ++output_dim; + } + auto broadcast = + computation->AddInstruction(HloInstruction::CreateBroadcast( + padded_out_shape, + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(out->shape().element_type()))), + {})); + new_operands.push_back(broadcast); + } + + auto input_tuple = loop->mutable_operand(0); + // Create the new input subtuple that contains the small operands and the + // reduce-shape result buffers. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR( + input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto body_root = body->root_instruction(); + CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); + // Update tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body_root}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = + new_input_subtuple->shape(); + } + auto new_loop_input = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, 2)); + + // Now create the moved nodes inside the loop body. + std::unordered_map outside_to_inside; + worklist.clear(); + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_loop_input, i)); + add_users_if_available(new_operands[i]); + } + // The elementwise nodes will be created with sliced shape. The original loop + // output corresponds to the dynamic-update-slice's update slice. + auto dus = body_root->mutable_operand(2); + CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); + outside_to_inside[original_output] = dus->mutable_operand(1); + add_users_if_available(original_output); + std::vector slice_offsets(padded_shape.rank()); + for (int64 i = 0; i < slice_offsets.size(); ++i) { + slice_offsets[i] = dus->mutable_operand(i + 2); + } + auto get_slice = [&](HloInstruction* padded) { + return body->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + padded->shape().element_type()), + padded, slice_offsets, dus->operand(1)->shape().dimensions())); + }; + // Helper functions to create nodes with small operands. 
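+ // [Editor's note, illustrative]: each sunk broadcast, iota or constant is
+ // first materialized (or padded) to the full padded_shape and then
+ // dynamic-sliced with the same offsets as the loop's dynamic-update-slice,
+ // so each iteration only sees its own shard of the value.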
+ auto add_broadcast = [&](const HloInstruction* broadcast) { + auto padded_operand_shape = broadcast->operand(0)->shape(); + for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { + padded_operand_shape.set_dimensions( + i, padded_shape.dimensions(broadcast->dimensions(i))); + } + auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)], + padded_operand_shape, nullptr, body); + outside_to_inside[broadcast] = + get_slice(body->AddInstruction(broadcast->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + padded_operand_shape.element_type()), + {padded_operand}))); + }; + auto add_iota = [&](const HloInstruction* iota) { + outside_to_inside[iota] = + get_slice(body->AddInstruction(iota->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + iota->shape().element_type()), + {}))); + }; + auto add_constant = [&](const HloInstruction* constant) { + outside_to_inside[constant] = body->AddInstruction(constant->Clone()); + outside_to_inside[constant] = get_slice( + PadToShape(outside_to_inside[constant], + ShapeUtil::ChangeElementType( + padded_shape, constant->shape().element_type()), + nullptr, body)); + }; + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (outside_to_inside.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast) { + add_broadcast(inst); + } else if (inst->opcode() == HloOpcode::kIota) { + add_iota(inst); + } else if (inst->opcode() == HloOpcode::kConstant) { + add_constant(inst); + } else if (inst->opcode() == HloOpcode::kReduce) { + // This is an output, for which we has special handling later. + } else { + std::vector operands_inside(inst->operand_count()); + for (int64 i = 0; i < operands_inside.size(); ++i) { + operands_inside[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + inst->shape().element_type()), + operands_inside)); + } + add_users_if_available(inst); + } + std::vector new_outputs_inside(new_operands.size()); + for (int64 i = 0; i < new_outputs_inside.size(); ++i) { + new_outputs_inside[i] = outside_to_inside[new_operands[i]]; + } + // Now create the reduce outpus inside of the loop. + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + auto reduce_outside = reduce_outputs[i]; + CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce); + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto last_iter_result = outside_to_inside[new_operands[index_in_operand]]; + auto operand0 = outside_to_inside[reduce_outside->operand(0)]; + auto operand1 = outside_to_inside[reduce_outside->operand(1)]; + TF_ASSIGN_OR_RETURN(auto reduce_shape, + ShapeInference::InferReduceShape( + {&operand0->shape(), &operand1->shape()}, + reduce_outside->dimensions(), + reduce_outside->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = reduce_outside->shape().layout(); + std::vector reduce_dus_offsets; + // If any collapsed dimension is windowed, we need to accumulate with last + // iteration's result. If such a dimension has padding, we also need to mask + // off invalid data. 
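+ // [Editor's example, hypothetical sizes]: if a collapsed dimension of
+ // logical size 500 is windowed into 4 per-iteration slices of 128, each
+ // iteration's partial reduce is accumulated into the previous result, and
+ // the iota/compare/select below replaces the 12 out-of-range elements of
+ // the last slice with the reduce's init value before reducing.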
+ bool needs_accumulate = false;
+ std::vector<int64> dims_to_mask;
+ for (int64 i = 0; i < slice_offsets.size(); ++i) {
+ if (absl::c_linear_search(reduce_outside->dimensions(), i)) {
+ if (reduce_outside->operand(0)->shape().dimensions(i) !=
+ operand0->shape().dimensions(i)) {
+ needs_accumulate = true;
+ if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) {
+ dims_to_mask.push_back(i);
+ }
+ }
+ continue;
+ }
+ reduce_dus_offsets.push_back(slice_offsets[i]);
+ }
+ // Mask off invalid data in collapsed dimensions.
+ for (int64 dim : dims_to_mask) {
+ auto iota = body->AddInstruction(HloInstruction::CreateIota(
+ ShapeUtil::ChangeElementType(operand0->shape(), S32), dim));
+ auto add = body->AddInstruction(HloInstruction::CreateBinary(
+ iota->shape(), HloOpcode::kAdd, iota,
+ body->AddInstruction(HloInstruction::CreateBroadcast(
+ iota->shape(), slice_offsets[dim], {}))));
+ auto limit = body->AddInstruction(HloInstruction::CreateBroadcast(
+ iota->shape(),
+ body->AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(
+ reduce_outside->operand(0)->shape().dimensions(dim)))),
+ {}));
+ auto compare = body->AddInstruction(HloInstruction::CreateCompare(
+ ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit,
+ ComparisonDirection::kLt));
+ operand0 = body->AddInstruction(HloInstruction::CreateTernary(
+ operand0->shape(), HloOpcode::kSelect, compare, operand0,
+ body->AddInstruction(HloInstruction::CreateBroadcast(
+ operand0->shape(), operand1, {}))));
+ }
+ auto output_inside =
+ body->AddInstruction(reduce_outside->CloneWithNewOperands(
+ reduce_shape, {operand0, operand1}));
+ // Accumulate with previous results if needed.
+ if (needs_accumulate) {
+ auto input_slice =
+ body->AddInstruction(HloInstruction::CreateDynamicSlice(
+ output_inside->shape(), last_iter_result, reduce_dus_offsets,
+ output_inside->shape().dimensions()));
+ output_inside = body->AddInstruction(HloInstruction::CreateBinary(
+ output_inside->shape(),
+ reduce_outside->to_apply()->root_instruction()->opcode(),
+ output_inside, input_slice));
+ }
+ // Dynamic-update-slice if needed.
+ if (!ShapeUtil::Compatible(output_inside->shape(),
+ last_iter_result->shape())) {
+ output_inside =
+ body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+ last_iter_result->shape(), last_iter_result, output_inside,
+ reduce_dus_offsets));
+ }
+ new_outputs_inside[index_in_operand] = output_inside;
+ }
+ // Body output.
+ auto new_output_inside =
+ body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside));
+ TF_RETURN_IF_ERROR(
+ body_root->ReplaceOperandWithDifferentShape(2, new_output_inside));
+ TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus));
+ // Replace uses of the reduces outside the loop.
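For a concrete picture of the masking above, consider a collapsed dimension whose unpadded size is 5, padded to 6, and processed in windows of 2 (a sketch with made-up numbers, not taken from the patch):

// iota along the masked dim (window of 2):        {0, 1}
// + broadcasted slice offset for this iteration:  {4, 5}
// compare < unpadded size (5):                    {true, false}
// select(mask, operand0, broadcast(init)):        keeps index 4 and replaces
//                                                 the padded element with the
//                                                 reduce's init value.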
+ auto new_output_gte =
+ computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+ new_output_inside->shape(), loop, 2));
+ for (int64 i = 0; i < reduce_outputs.size(); ++i) {
+ int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i;
+ auto new_output =
+ computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+ new_outputs_inside[index_in_operand]->shape(), new_output_gte,
+ index_in_operand));
+ if (!ShapeUtil::Compatible(new_output->shape(),
+ reduce_outputs[i]->shape())) {
+ new_output = computation->AddInstruction(HloInstruction::CreateSlice(
+ reduce_outputs[i]->shape(), new_output,
+ std::vector<int64>(new_output->shape().rank(), 0),
+ reduce_outputs[i]->shape().dimensions(),
+ std::vector<int64>(new_output->shape().rank(), 1)));
+ }
+ TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output));
+ TF_RETURN_IF_ERROR(
+ computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i]));
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops(
+ HloComputation* computation) {
+ for (auto& loop : windowed_dot_general_loops_) {
+ if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) {
+ // We have a dynamic-slice for the non-windowed operand in
+ // batch/contracting-dim windowed dot-general. So moving the
+ // broadcast/iota/elementwise ops into the loop could help reduce memory
+ // via fusion.
+ TF_RETURN_IF_ERROR(
+ SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions(
+ loop.while_loop, 1 - loop.windowed_operand));
+ }
+ if (!loop.windowed_in_contracting_dims) {
+ // We have a dynamic-update-slice for the output in
+ // batch/non-contracting-dim windowed dot-general. So moving reduce ops
+ // into the loop could help reduce memory.
+ TF_RETURN_IF_ERROR(
+ MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions(
+ loop.while_loop));
+ }
+ }
+ return Status::OK();
+}
+
+StatusOr<bool> SpmdPartitioningVisitor::DoPartition(
+ HloComputation* computation, const HloSharding& root_sharding) {
+ VLOG(2) << "Partitioning computation " << computation->name() << " for "
+ << num_replicas_ << " replicas and " << num_partitions_
+ << " partitions";
+ TF_RETURN_IF_ERROR(computation->Accept(this));
+
+ HloModule* module = computation->parent();
+ auto new_root =
+ GetPartitionedHlo(computation->root_instruction()).Reshard(root_sharding);
+ auto new_computation =
+ module->AddEmbeddedComputation(b_.Build(new_root.hlo()));
+ TF_RETURN_IF_ERROR(DoCodeMotionForWindowedDotGeneralLoops(new_computation));
+
+ // Replace the original computation with the new SPMD computation.
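The dispatch in DoCodeMotionForWindowedDotGeneralLoops consumes entries in windowed_dot_general_loops_. A hedged sketch of how such an entry might be recorded when the windowed loop is first built (the `while_op` variable and the flag values are illustrative; the actual registration site is not part of this hunk):

// Field names come from the WindowedDotGeneralLoop struct declared in the
// header below; while_op is a stand-in for the created kWhile instruction.
windowed_dot_general_loops_.push_back({/*while_loop=*/while_op,
                                       /*windowed_operand=*/1,
                                       /*windowed_in_contracting_dims=*/true,
                                       /*windowed_in_batch_dims=*/false});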
+ std::unordered_map replacement; + replacement[computation] = new_computation; + module->ReplaceComputations(replacement); + return changed_; +} + +Status SpmdPartitioningVisitor::HandlePartitionId(HloInstruction* hlo) { + return Unimplemented( + "PartitionId instruction is not supported for SPMD partitioning since " + "the meaning is ambiguous -- whether the instruction is replicated or " + "the data is replicated, and if the latter which data is replicated."); +} + +SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, + int64 num_replicas) { + return { + [](SpmdBuilder* b) { + return b->AddInstruction(HloInstruction::CreatePartitionId()); + }, + [num_replicas](SpmdBuilder* b, HloInstruction* operand, + HloComputation* reduction, int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateAllReduce( + operand->shape(), {operand}, reduction, + CreateReplicaGroups(num_replicas), + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/false)); + }, + [](SpmdBuilder* b, HloInstruction* operand, + std::vector>& src_dst_pairs, + int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateCollectivePermute( + operand->shape(), operand, src_dst_pairs, channel_id)); + }, + [](SpmdBuilder* b, absl::Span operands, + const std::vector& replica_groups, int64 channel_id, + absl::optional split_dimension) { + std::vector shapes(operands.size(), operands[0]->shape()); + const Shape output_shape = (shapes.size() == 1) + ? shapes[0] + : ShapeUtil::MakeTupleShape(shapes); + return b->AddInstruction(HloInstruction::CreateAllToAll( + output_shape, operands, replica_groups, + /*constrain_layout=*/false, channel_id, split_dimension)); + }, + [num_replicas, num_partitions]( + SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape, + const std::vector>& partition_subgroups, + int64 channel_id, int64 all_gather_dimension) { + std::vector device_groups; + device_groups.reserve(partition_subgroups.size() * num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + for (const auto& pgroup : partition_subgroups) { + device_groups.emplace_back(); + for (int64 pid : pgroup) { + device_groups.back().add_replica_ids(i * num_partitions + pid); + } + } + } + return b->AddInstruction(HloInstruction::CreateAllGather( + ag_shape, operand, all_gather_dimension, device_groups, + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/true)); + }, + }; +} + +SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options) + : SpmdPartitioner( + num_partitions, num_replicas, std::move(options), + GetDefaultCollectiveOpsCreator(num_partitions, num_replicas)) {} + +HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, + HloInstruction* operand, + const HloSharding& sharding, + int64 channel_id) { + CHECK(!sharding.IsTileMaximal()); + // Add one leading dimension to gather all partitions. 
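For illustration, a minimal sketch of how the creator built by GetDefaultCollectiveOpsCreator is meant to be consumed; `b`, `partial_sum`, `add_computation`, and `next_channel_id` are assumed to exist at the call site and are not part of this change:

// Build the default creators, then use them instead of constructing the
// collective HLOs directly.
SPMDCollectiveOpsCreator creator =
    GetDefaultCollectiveOpsCreator(/*num_partitions=*/8, /*num_replicas=*/1);
HloInstruction* partition_id = creator.create_partition_id(b);
HloInstruction* global_sum = creator.create_cross_partition_all_reduce(
    b, partial_sum, add_computation, /*channel_id=*/(*next_channel_id)++);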
+ std::vector<int64> shape;
+ shape.push_back(1);
+ for (int64 dim : operand->shape().dimensions()) {
+ shape.push_back(dim);
+ }
+ auto reshape = b->AddInstruction(HloInstruction::CreateReshape(
+ ShapeUtil::MakeShape(operand->shape().element_type(), shape), operand));
+ std::vector<std::vector<int64>> partition_subgroups(1);
+ for (int64 pid : sharding.tile_assignment()) {
+ partition_subgroups[0].push_back(pid);
+ }
+ shape[0] = sharding.tile_assignment().num_elements();
+ auto result = collective_ops_creator_.create_cross_partition_all_gather(
+ b, reshape, ShapeUtil::MakeShape(operand->shape().element_type(), shape),
+ partition_subgroups, channel_id, /*all_gather_dimension=*/0);
+ // If n > 1 dimensions are partitioned, split the leading dimension to n.
+ std::vector<int64> tiled_dims;
+ for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) {
+ if (sharding.tile_assignment().dim(i) > 1) {
+ tiled_dims.push_back(i);
+ }
+ }
+ if (tiled_dims.size() > 1) {
+ std::vector<int64> split_dim_shape;
+ split_dim_shape.reserve(tiled_dims.size() + operand->shape().rank());
+ for (int64 i : tiled_dims) {
+ split_dim_shape.push_back(sharding.tile_assignment().dim(i));
+ }
+ for (int64 dim : operand->shape().dimensions()) {
+ split_dim_shape.push_back(dim);
+ }
+ result = b->AddInstruction(HloInstruction::CreateReshape(
+ ShapeUtil::MakeShape(operand->shape().element_type(), split_dim_shape),
+ result));
+ }
+ // Transpose the gathered dimensions so that they are next to their
+ // corresponding partitioned dimensions.
+ std::vector<int64> xpose_permutation(result->shape().rank());
+ int64 split_dims_added = 0;
+ for (int64 i = 0; i < xpose_permutation.size(); ++i) {
+ if (sharding.tile_assignment().dim(i - split_dims_added) == 1) {
+ xpose_permutation[i] = i + tiled_dims.size() - split_dims_added;
+ } else {
+ xpose_permutation[i] = split_dims_added;
+ xpose_permutation[i + 1] = i + tiled_dims.size() - split_dims_added;
+ split_dims_added++;
+ i++;
+ }
+ }
+ result = b->AddInstruction(HloInstruction::CreateTranspose(
+ ShapeInference::InferTransposeShape(result->shape(), xpose_permutation)
+ .ValueOrDie(),
+ result, xpose_permutation));
+ // Reshape to the desired shape.
+ auto ag_shape = operand->shape();
+ for (int64 i : tiled_dims) {
+ ag_shape.set_dimensions(
+ i, ag_shape.dimensions(i) * sharding.tile_assignment().dim(i));
+ }
+ result = b->AddInstruction(HloInstruction::CreateReshape(ag_shape, result));
+ return result;
+}
+
+StatusOr<bool> SpmdPartitioner::PartitionComputation(
+ HloComputation* computation, const HloSharding& root_sharding,
+ int64* next_channel_id, SpmdLogger* logger) {
+ auto visitor =
+ CreateVisitor(computation, num_partitions_, num_replicas_,
+ collective_ops_creator_, next_channel_id, logger, options_);
+ return visitor->DoPartition(computation, root_sharding);
+}
+
+std::unique_ptr<SpmdPartitioningVisitor> SpmdPartitioner::CreateVisitor(
+ HloComputation* computation, int64 num_partitions, int64 num_replicas,
+ const SPMDCollectiveOpsCreator& collective_ops_creator,
+ int64* next_channel_id, SpmdLogger* logger,
+ SpmdPartitionerOptions options) {
+ return absl::make_unique<SpmdPartitioningVisitor>(
+ computation, num_partitions, num_replicas, collective_ops_creator,
+ next_channel_id, logger, std::move(options), this);
+}
+
+StatusOr<bool> SpmdPartitioner::Run(HloModule* module) {
+ TF_RETURN_IF_ERROR(PreprocessSharding(module));
+
+ XLA_VLOG_LINES(1, SpmdLogger::ReportBeforePartition(
+ *module, options_.report_instruction_count));
+
+ // Add the parameters' and output's shardings to the module.
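To make the reshape/transpose sequence in AllGatherShards above concrete, here is a worked shape trace (illustrative only) for an f32[4,6] operand tiled by a [2,2] assignment over 4 partitions:

// per-partition shard:                      f32[2,3]
// reshape with a leading 1:                 f32[1,2,3]
// all-gather on dim 0 over 4 partitions:    f32[4,2,3]
// split the leading dim into 2 tiled dims:  f32[2,2,2,3]
// transpose with permutation {0,2,1,3}:     f32[2,2,2,3]  (tile0, shard0, tile1, shard1)
// final reshape back to the full shape:     f32[4,6]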
+ std::vector<HloSharding> entry_params_shardings;
+ for (int64 i = 0; i < module->entry_computation()->num_parameters(); ++i) {
+ auto param = module->entry_computation()->parameter_instruction(i);
+ CHECK(param->has_sharding()) << "Missing sharding in entry parameter " << i;
+ entry_params_shardings.push_back(param->sharding());
+ }
+ module->set_spmd_parameters_shardings(entry_params_shardings);
+ auto entry_root = module->entry_computation()->root_instruction();
+ CHECK(entry_root->has_sharding()) << "Missing sharding in entry root.";
+ module->set_spmd_output_sharding(entry_root->sharding());
+
+ FlattenCallGraph flatten;
+ TF_ASSIGN_OR_RETURN(auto changed, flatten.Run(module));
+
+ SpmdLogger logger(options_.report_instruction_count);
+ auto program_shape = module->entry_computation()->ComputeProgramShape();
+ int64 next_channel_id = hlo_query::NextChannelId(*module);
+ TF_ASSIGN_OR_RETURN(
+ bool partition_changed,
+ PartitionComputation(
+ module->entry_computation(),
+ module->entry_computation()->root_instruction()->sharding(),
+ &next_channel_id, &logger));
+ changed |= partition_changed;
+
+ // For the entry computation, make sure that the root instruction and the
+ // parameters preserve their signatures.
+ auto new_program_shape = module->entry_computation()->ComputeProgramShape();
+ if (!options_.allow_module_signature_change) {
+ TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()(
+ program_shape.result(), new_program_shape.result()))
+ << "Result shape changed for the entry computation";
+ TF_RET_CHECK(program_shape.parameters_size() ==
+ new_program_shape.parameters_size())
+ << "Parameter count changed for the entry computation";
+ for (int64 i = 0; i < program_shape.parameters_size(); ++i) {
+ TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()(
+ program_shape.parameters(i), new_program_shape.parameters(i)))
+ << "Parameter shape changed for the entry computation";
+ }
+ } else {
+ const auto& old_entry_layout = module->entry_computation_layout();
+ // Shapes can change but the layout should still remain the same.
+ for (int64 i = 0; i < new_program_shape.parameters_size(); ++i) {
+ TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+ old_entry_layout.parameter_shape(i),
+ new_program_shape.mutable_parameters(i)));
+ }
+ TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+ old_entry_layout.result_shape(), new_program_shape.mutable_result()));
+
+ HloModuleConfig config = module->config();
+ *config.mutable_entry_computation_layout() =
+ ComputationLayout(new_program_shape, /*ignore_layouts=*/false);
+ module->set_config(config);
+ }
+
+ XLA_VLOG_LINES(1, SpmdLogger::ReportAfterPartition(
+ *module, options_.report_instruction_count));
+ XLA_VLOG_LINES(1, logger.MakeReport());
+
+ if (changed) {
+ HloPassPipeline pass("spmd-cleanup");
+ pass.AddPass<TupleSimplifier>();
+ pass.AddPass<HloDCE>();
+ pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+ pass.AddPass<FlattenCallGraph>();
+ TF_RETURN_IF_ERROR(pass.Run(module).status());
+ }
+
+ TF_RETURN_IF_ERROR(ClearShardingAttributes(module));
+ return changed;
+}
+
+Status SpmdPartitioner::PreprocessSharding(HloModule* module) {
+ for (HloComputation* computation : module->computations()) {
+ for (HloInstruction* hlo : computation->instructions()) {
+ if (hlo->HasSideEffectNoRecurse() && hlo->opcode() != HloOpcode::kRng) {
+ TF_RET_CHECK(hlo->has_sharding())
+ << "Side-effect HLO must have sharding: " << hlo->ToString();
+ TF_RET_CHECK(!HasReplicatedSharding(hlo->sharding()) ||
+ hlo->opcode() == HloOpcode::kInfeed)
+ << "Non-infeed side-effect HLO cannot have a replicated sharding:"
+ << hlo->ToString();
+ }
+
+ // For unassigned HLOs, annotate with replicated sharding.
+ //
+ // Among side-effecting ops, only Rng is allowed to omit the annotation.
+ // In that case, we currently force it to run on core 0, since we don't
+ // support partitioning or replicating the Rng op (the values depend on
+ // the seed provided to each device).
+ //
+ // TODO(hyouklee): Should we also convert single-device shardings (without
+ // side-effects) into replicated?
+ if (!hlo->has_sharding()) {
+ if (hlo->opcode() == HloOpcode::kRng) {
+ hlo->set_sharding(HloSharding::AssignDevice(0));
+ } else {
+ hlo->set_sharding(
+ HloSharding::Single(hlo->shape(), HloSharding::Replicate()));
+ }
+ } else if (!hlo->sharding().IsTileMaximal()) {
+ std::vector<int64> available(num_partitions_);
+ std::iota(available.begin(), available.end(), 0);
+ TF_RET_CHECK(num_partitions_ == hlo_sharding_util::DevicesForSharding(
+ hlo->sharding(), available)
+ .size())
+ << "num_partitions:" << num_partitions_ << "\n"
+ << "SPMD partitioner only supports tile sharding that includes all "
+ "partitions. If you didn't add this sharding annotation in the "
+ "model, please file a bug to the XLA team.\n"
+ << hlo->ToString();
+ }
+ }
+ }
+
+ // Entry computation's parameter and root sharding must be either all
+ // replicated or all on a single device.
+ if (!options_.allow_module_signature_change) { + const HloComputation* entry = module->entry_computation(); + TF_RET_CHECK(entry->root_instruction()->has_sharding()); + const HloSharding& root_sharding = entry->root_instruction()->sharding(); + TF_RET_CHECK(root_sharding.IsReplicated() || + root_sharding.UniqueDevice().has_value()) + << "Unsupported entry root sharding: " << root_sharding.ToString(); + + for (const HloInstruction* param : entry->parameter_instructions()) { + TF_RET_CHECK(param->has_sharding()); + TF_RET_CHECK(param->sharding().IsReplicated() || + param->sharding().UniqueDevice().has_value()) + << "Unsupported entry parameter sharding:" + << param->sharding().ToString(); + } + } + + return Status::OK(); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h new file mode 100644 index 00000000000..52e4c9021d8 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -0,0 +1,465 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace spmd { + +struct SpmdPartitionerOptions { + // Always exchange halo on LHS for all convolutions. If false, backprop filter + // convolution exchanges halo on RHS. + bool conv_halo_exchange_always_on_lhs = true; + + // The number of instructions to be reported for the highest memory profile + // instructions. + int64 report_instruction_count = 5; + + // The minimum size in MiB of an einsum operand to be considered using + // windowed implementation in an HLO loop. + int64 threshold_for_windowed_einsum_mib = 256; + + // Whether the entry computations' signature could change after partitioning. + bool allow_module_signature_change = false; +}; + +// Class to wrap the computation builder to capture information during SPMD +// transformation. 
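A minimal sketch of constructing the pass with these options (the partition and replica counts are arbitrary, and the SpmdPartitioner constructor used here is the one declared further down in this header):

// Configure the partitioner; defaults shown explicitly for clarity.
SpmdPartitionerOptions options;
options.conv_halo_exchange_always_on_lhs = true;
options.threshold_for_windowed_einsum_mib = 256;
options.allow_module_signature_change = false;
SpmdPartitioner partitioner(/*num_partitions=*/8, /*num_replicas=*/1, options);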
+class SpmdBuilder : public HloComputation::Builder { + public: + SpmdBuilder(const std::string& name, HloInstruction* hlo) + : HloComputation::Builder(name) { + visiting_hlo_ = hlo; + } + HloInstruction* AddInstruction(std::unique_ptr instruction); + + const std::vector& derived_instructions( + HloInstruction* hlo) { + return instructions_.at(hlo); + } + + void set_visiting_hlo(HloInstruction* hlo) { visiting_hlo_ = hlo; } + + HloInstruction* visiting_hlo() const { return visiting_hlo_; } + + private: + // Currently visiting instruction. + HloInstruction* visiting_hlo_; + + // Map from the currently visiting (old) instruction to new instructions + // created during SPMD partitioning. + HloInstructionMap> instructions_; +}; + +// A set of functions that create the cross-partition collective ops. +struct SPMDCollectiveOpsCreator { + // Function used to create a partition ID HLO. + std::function create_partition_id; + + // Function used to create a cross-partition all-reduce HLO. + std::function + create_cross_partition_all_reduce; + + // Function used to create a cross-partition collective-permute HLO. + std::function>& src_dst_pairs, + int64 next_channel_id)> + create_cross_partition_collective_permute; + + // Function used to create a cross-partition all-to-all HLO. + std::function operands, + const std::vector& replica_groups, int64 channel_id, + absl::optional split_dimension)> + create_cross_partition_all_to_all; + + // Function used to create a cross-partition all-gather HLO. This is optional: + // if it is nullptr, the partitioner will use all-reduce instead. + std::function>& partition_subgroups, + int64 channel_id, int64 all_gather_dimension)> + create_cross_partition_all_gather; +}; + +// Create a default SPMDCollectiveOpsCreator. +SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, + int64 num_replicas); + +// Logger to report memory usage during SPMD partitioning. +class SpmdLogger { + public: + explicit SpmdLogger(int64 report_instruction_count) + : report_instruction_count_(report_instruction_count) {} + static std::string ReportBeforePartition(const HloModule& module, + int64 report_instruction_count); + static std::string ReportAfterPartition(const HloModule& module, + int64 report_instruction_count); + + // Registers the logging for the groups of instructions created to transform + // the given hlo. + void RegisterLogEntry(HloInstruction* hlo, + const std::vector& group); + + std::string MakeReport(); + + private: + template + static std::string ReportMemoryUsage(const HloModule& module, const F& filter, + int64 report_instruction_count); + + // A vector of logging messages (one for each original HLO instruction), where + // the first integer of the pair represents the size of the HBM used. 
+ std::vector> entries_; + + int64 report_instruction_count_; +}; + +class SpmdPartitioningVisitor; + +class SpmdPartitioner : public HloModulePass { + public: + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options); + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options, + SPMDCollectiveOpsCreator collective_ops_creator) + : num_partitions_(num_partitions), + num_replicas_(num_replicas), + options_(std::move(options)), + collective_ops_creator_(std::move(collective_ops_creator)) {} + absl::string_view name() const override { return "spmd-partitioning"; } + StatusOr Run(HloModule* module) override; + + // Transforms the given computation with SPMD instructions, replacing it with + // a new computation. + StatusOr PartitionComputation(HloComputation* computation, + const HloSharding& root_sharding, + int64* next_channel_id, + SpmdLogger* logger); + + // Creates all-gather based on HloSharding. Can be overridden to customize. + // The default uses a single all-gather even if there are multiple sharded + // dimensions, and adds potential reshapes and transposes to achieve that. + // If it returns false, the partitioner will fall back to all-reduce. + virtual HloInstruction* AllGatherShards(SpmdBuilder* b, + HloInstruction* operand, + const HloSharding& sharding, + int64 channel_id); + + protected: + virtual std::unique_ptr CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options); + + // Verify that the sharding of instructions in the module are valid, and also + // fill in missing sharding information. + Status PreprocessSharding(HloModule* module); + + const int64 num_partitions_; + const int64 num_replicas_; + + SpmdPartitionerOptions options_; + SPMDCollectiveOpsCreator collective_ops_creator_; +}; + +// Class describes partition state of the data represented by an HLO created +// during SPMD partitioning pass. +// +// Data on some devices may include padding region, if the base (full) shape +// could not be evenly partitioned. +class PartitionedHlo { + public: + // Return value for ReshardAsWindowedInput which describes the resharded HLO, + // the window for the user on the shard, and if necessary, the dynamic slice + // offsets to be applied to the output of the op being sharded. + struct WindowedInputShardReturnValue { + HloInstruction* sharded_input; + Window shard_window; + absl::optional> dynamic_slice_index_on_output; + }; + // A cache for resharding each partitioned HLO. + struct ReshardCache { + struct PerHloCache { + std::vector> reshard_cache; + std::vector< + std::tuple> + window_reshard_cache; + }; + std::unordered_map per_hlo_cache; + }; + struct PartitioningState { + SpmdBuilder* b; + HloModule* module; + int64 num_replicas; + HloInstruction* partition_id; + SPMDCollectiveOpsCreator collective_ops_creator; + int64* next_channel_id; + ReshardCache* reshard_cache; + SpmdPartitioner* partitioner; + }; + PartitionedHlo(HloInstruction* hlo, Shape base_shape, PartitioningState state) + : hlo_(hlo), base_shape_(base_shape), state_(std::move(state)) { + CHECK(hlo->has_sharding()) + << "PartitionedHlo is missing sharding:" << hlo->ToString(); + // If the tuple shape instruction does not have a tuple sharding, reassign + // to use the tuple sharding. Reshard() implementation assumes this. 
+ if (hlo_->shape().IsTuple() && !hlo_->sharding().IsTuple()) { + hlo_->set_sharding( + hlo_->sharding().GetTupleSharding(hlo_->shape()).ValueOrDie()); + } + } + + // Reshards the current SPMD instruction to a new sharding. Could only modify + // the reshard cache. + PartitionedHlo Reshard(const HloSharding& target); + + // Pads the garbage area of the output with the provided value. Normally, + // unevenly partitioned dimensions are padded on the right, but this function + // allows specifying left-padded dimensions, which can be used during the + // handling of kReverse, etc. + PartitionedHlo PadWithValue( + HloInstruction* pad_value, + absl::Span left_padded_dims = {}) const; + + // Returns the SPMD instruction. + HloInstruction* hlo() const { return hlo_; } + + // Returns the sharding of the SPMD instruction. + const HloSharding& sharding() const { return hlo_->sharding(); } + + // Original full shape of the data. + const Shape& base_shape() const { return base_shape_; } + + int64 NewChannel() const { return (*state_.next_channel_id)++; } + + // Reshards the HLO to a usable partitioned input for a windowed user. Could + // only modify the reshard cache. + absl::optional ReshardAsWindowedInput( + const Window& window, const HloSharding& target, + HloInstruction* pad_value, bool mask_invalid_region = true); + + const PartitioningState& state() const { return state_; } + + private: + // Same as Reshard except that it does not explicitly modify the reshard + // cache, although it would indirectly modify by calling Replicate(). + PartitionedHlo ReshardNoCache(const HloSharding& target); + + // Helper function to replicate the data on all devices. Could only modify + // the reshard cache. + PartitionedHlo Replicate(); + + // Helper function to broadcast data from a single device to all devices. + PartitionedHlo Broadcast() const; + + // Helper function to reshard the tensor using AllToAll (instead of the + // default of Replicate followed by Slice). + PartitionedHlo ReshardWithAllToAll(const HloSharding& target) const; + + // Helper function to reshard the tensor using CollectivePermute. + PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + + // SPMD instruction. + HloInstruction* hlo_; + + // The original shape of the data before SPMD transformation is applied. + Shape base_shape_; + + PartitioningState state_; +}; + +struct DotGeneralDimsMapping { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. 
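For illustration, a hedged sketch of the typical lifecycle of a PartitionedHlo inside a visitor handler; `hlo`, `base_shape`, and `MakePartitioningState()` are assumed to come from the visitor and are not defined at this point in the header:

// Wrap the per-partition instruction together with its full (base) shape and
// the partitioning state, then reshard as required by the consumer.
PartitionedHlo partitioned(hlo, base_shape, MakePartitioningState());
HloInstruction* replicated =
    partitioned.Reshard(HloSharding::Replicate()).hlo();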
+ struct DimsMapping { + int64 lhs; + int64 rhs; + int64 output; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { + public: + SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options, SpmdPartitioner* partitioner); + + Status DefaultAction(HloInstruction* hlo) override; + Status HandleAllReduce(HloInstruction* hlo) override; + Status HandleBroadcast(HloInstruction* hlo) override; + Status HandleConstant(HloInstruction* hlo) override; + Status HandleCustomCall(HloInstruction* hlo) override; + Status HandleDot(HloInstruction* hlo) override; + Status HandleDynamicSlice(HloInstruction* hlo) override; + Status HandleDynamicUpdateSlice(HloInstruction* hlo) override; + Status HandleGather(HloInstruction* hlo) override; + Status HandleGetTupleElement(HloInstruction* hlo) override; + Status HandleInfeed(HloInstruction* hlo) override; + Status HandleOutfeed(HloInstruction* hlo) override; + Status HandlePad(HloInstruction* hlo) override; + Status HandleParameter(HloInstruction* hlo) override; + Status HandleReduce(HloInstruction* hlo) override; + Status HandleReverse(HloInstruction* hlo) override; + Status HandleWhile(HloInstruction* hlo) override; + Status HandleConditional(HloInstruction* hlo) override; + Status HandleReduceWindow(HloInstruction* hlo) override; + Status HandleSelectAndScatter(HloInstruction* hlo) override; + Status HandleTuple(HloInstruction* hlo) override; + Status HandleRng(HloInstruction* hlo) override; + Status HandleConvolution(HloInstruction* hlo) override; + Status HandleConcatenate(HloInstruction* hlo) override; + Status HandleScatter(HloInstruction* hlo) override; + Status HandleSlice(HloInstruction* hlo) override; + Status HandleSort(HloInstruction* hlo) override; + Status HandleTranspose(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; + Status HandleIota(HloInstruction* hlo) override; + Status HandlePartitionId(HloInstruction* hlo) override; + + // Handles convolution where both LHS and RHS operands are tiled. + Status HandleConvolutionTiledLhsAndRhs(HloInstruction* hlo); + + // Implementation of dot partitioning given DotGeneralDimsMapping. + Status HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); + + // Common handle for elementwise HLOs. + Status HandleElementwise(HloInstruction* hlo); + + // Common handle for HLOs that runs on a single device. + Status HandleSingleDevice(const HloInstruction* hlo); + + // Returns the PartitionedHlo that corresponds to the original hlo. + PartitionedHlo& GetPartitionedHlo(const HloInstruction* hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 1); + return partitioned_instructions_.find(hlo)->second; + } + + // Sets the PartitionedHlo for the original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const PartitionedHlo& partitioned_hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 0); + partitioned_instructions_.emplace(hlo, partitioned_hlo); + changed_ = true; + } + + // Convenient wrapper that creates PartitionedHlo from the result of the func + // and maps it to the given original hlo. 
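As an illustration of how these handlers typically combine GetPartitionedHlo, Reshard, and the SetPartitionedHlo wrapper described in the comment above, here is a sketch of the elementwise case; MakePartitionedShape is assumed to be a helper from the partitioner's utility header, and this is a sketch rather than necessarily the exact implementation:

Status SpmdPartitioningVisitor::HandleElementwise(HloInstruction* hlo) {
  std::vector<HloInstruction*> new_operands;
  for (HloInstruction* operand : hlo->operands()) {
    // Reshard every operand to the output sharding, then use its
    // per-partition instruction.
    new_operands.push_back(
        GetPartitionedHlo(operand).Reshard(hlo->sharding()).hlo());
  }
  SetPartitionedHlo(hlo, [&] {
    // Clone the op at the per-partition shape with the resharded operands.
    return b_.AddInstruction(hlo->CloneWithNewOperands(
        MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands));
  });
  return Status::OK();
}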
+ void SetPartitionedHlo(const HloInstruction* hlo, + const std::function& func) { + HloInstruction* new_hlo = func(); + new_hlo->set_sharding(hlo->sharding()); + new_hlo->set_metadata(hlo->metadata()); + SetPartitionedHlo( + hlo, PartitionedHlo(new_hlo, hlo->shape(), MakePartitioningState())); + changed_ = true; + } + + int64 NewChannel() { return (*next_channel_id_)++; } + + PartitionedHlo::PartitioningState MakePartitioningState() { + PartitionedHlo::PartitioningState state; + state.b = &b_; + state.module = module_; + state.num_replicas = num_replicas_; + state.partition_id = partition_id_; + state.collective_ops_creator = collective_ops_creator_; + state.next_channel_id = next_channel_id_; + state.reshard_cache = &reshard_cache_; + state.partitioner = partitioner_; + return state; + } + + SpmdBuilder* builder() { return &b_; } + + StatusOr DoPartition(HloComputation* computation, + const HloSharding& root_sharding); + + private: + Status Preprocess(HloInstruction* hlo) override; + Status Postprocess(HloInstruction* hlo) override; + + // Performs code motion for windowed dot-general loops in + // windowed_dot_general_loops_. Invoked after the visitor finishes traversing + // the graph. + Status DoCodeMotionForWindowedDotGeneralLoops(HloComputation* computation); + + bool changed_; + HloModule* module_; + int64 num_partitions_; + int64 num_replicas_; + + SPMDCollectiveOpsCreator collective_ops_creator_; + + // Tracks the next channel id to use for cross-partition all-reduce. + int64* next_channel_id_; + SpmdBuilder b_; + + HloInstruction* partition_id_; + + PartitionedHlo::ReshardCache reshard_cache_; + + // Mapping from the instruction in the original computation to the new SPMD + // partitioned instruction. + ConstHloInstructionMap partitioned_instructions_; + + // Information about a loop created for windowed dot-general. Used when + // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor + // finishes traversing the graph. + struct WindowedDotGeneralLoop { + HloInstruction* while_loop; + int64 windowed_operand; + bool windowed_in_contracting_dims; + bool windowed_in_batch_dims; + }; + std::vector windowed_dot_general_loops_; + + HloInstruction* visiting_hlo_; + SpmdLogger* logger_; + const SpmdPartitionerOptions options_; + SpmdPartitioner* partitioner_; +}; + +} // namespace spmd +} // namespace xla +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc new file mode 100644 index 00000000000..1f0b1d06c1f --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -0,0 +1,3771 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace spmd { +namespace { + +using ::testing::_; +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; + +class SpmdPartitioningTest : public HloTestBase { + public: + StatusOr> PartitionComputation( + const char* hlo_module, int64 num_devices, + bool conv_halo_exchange_always_on_lhs = true) { + // Some tests (BackpropFilter convs) set this flag false to test two + // different paths of the implementation. + SpmdPartitionerOptions options; + options.conv_halo_exchange_always_on_lhs = conv_halo_exchange_always_on_lhs; + options.allow_module_signature_change = true; + auto collective_ops_creator = + GetDefaultCollectiveOpsCreator(num_devices, /*num_replicas=*/1); + // Do not use all-gather for pattern-matching purpose, as the partitioner + // might create reshape/transposes around it. + collective_ops_creator.create_cross_partition_all_gather = nullptr; + + TF_ASSIGN_OR_RETURN(auto module, ParseAndReturnVerifiedModule( + hlo_module, GetModuleConfigForTest())); + HloPassPipeline pass("spmd-partitioning"); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pass.AddPass(num_devices, /*num_replicas=*/1, options, + collective_ops_creator); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + TF_RETURN_IF_ERROR(pass.Run(module.get()).status()); + return StatusOr>(std::move(module)); + } +}; + +TEST_F(SpmdPartitioningTest, InvalidSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + auto module_status = PartitionComputation(hlo_string, /*num_devices=*/4); + EXPECT_FALSE(module_status.status().ok()); + EXPECT_THAT(module_status.status().ToString(), + ::testing::HasSubstr( + "only supports tile sharding that includes all partitions")); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Copy(op::AllReduce( + op::Select(op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]"))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), 
sharding={maximal device=1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + VLOG(1) << module->ToString(); + EXPECT_THAT(root, op::Copy(AllOf(op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]")))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), + sharding={devices=[2,1]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Copy(op::DynamicSlice( + op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Constant(), op::Broadcast())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())), + op::Shape("s32[1,3]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]"))))); +} + +TEST_F(SpmdPartitioningTest, TiledToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]")))))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledEven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= s32[8,2]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT %copy = s32[8,2]{1,0} copy(%param), sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Reshape(op::Transpose(op::AllToAll(AllOf( + op::Reshape(op::Parameter()), op::Shape("s32[4,2,1]")))))), + op::Shape("s32[8,1]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= f32[7,31,128]{2,1,0} parameter(0), 
sharding={devices=[1,2,1]0,1} + ROOT %copy = f32[7,31,128]{2,1,0} copy(%param), sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Slice(op::Reshape(AllOf(op::Transpose(op::AllToAll( + op::Reshape(AllOf(op::Pad(), op::Shape("f32[8,16,128]"))))))))))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementSwapDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param.0 = (f32[2,3]{1,0}, u32[]) parameter(0), + sharding={{maximal device=1}, {maximal device=1}} + %gte.0 = f32[2,3]{1,0} get-tuple-element(%param.0), index=0, + sharding={maximal device=0} + %gte.1 = u32[] get-tuple-element(%param.0), index=1, + sharding={maximal device=0} + ROOT %tuple = (f32[2,3]{1,0}, u32[]) tuple(%gte.0, %gte.1), + sharding={{maximal device=0},{maximal device=0}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + EXPECT_THAT(root->operand(0), + op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); + EXPECT_THAT(root->operand(1), + op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param.0 = (f32[2,3]{1,0}, u32[2,3]{1,0}) parameter(0), + sharding={{replicated}, {replicated}} + gte.0 = f32[2,3]{1,0} get-tuple-element(param.0), index=0, + sharding={devices=[2,1]0,1} + gte.1 = u32[2,3]{1,0} get-tuple-element(param.0), index=1, + sharding={devices=[2,1]0,1} + ROOT %tuple = (f32[2,3]{1,0}, u32[2,3]{1,0}) tuple(gte.0, gte.1), + sharding={{devices=[2,1]0,1},{devices=[2,1]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + + EXPECT_THAT(root->operand(0), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); + EXPECT_THAT(root->operand(1), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); +} + +TEST_F(SpmdPartitioningTest, TiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), 
op::PartitionId(), + op::Constant())), + op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[9,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[9,2]{1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[5,2]"), op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), + op::AfterAll(), op::AfterAll())))); + EXPECT_THAT( + root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), op::Infeed(op::Parameter()))); + auto second_infeed = + AllOf(op::Shape("(f32[4,2], token[])"), op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), + op::Tuple(op::Pad(op::GetTupleElement(second_infeed), + op::Constant()), + op::GetTupleElement(second_infeed)))); +} + +TEST_F(SpmdPartitioningTest, UnevenTiledTupleInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = ((f32[9,2]{1,0}, f32[2]{0}), token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {replicated}, {maximal device=0}} + ROOT infeed.data = (f32[9,2]{1,0}, f32[2]{0}) get-tuple-element(infeed), + index=0, sharding={{devices=[2,1]0,1}, {replicated}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("(f32[5,2], f32[2])"), + op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), op::AfterAll(), + op::AfterAll())))); + EXPECT_THAT(root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Infeed(op::Parameter()))); + auto second_infeed = AllOf(op::Shape("((f32[4,2], f32[2]), token[])"), + op::Infeed(op::Parameter())); + EXPECT_THAT( + root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::Pad(op::GetTupleElement( + op::GetTupleElement(second_infeed)), + op::Constant()), + op::GetTupleElement( + op::GetTupleElement(second_infeed))), + op::GetTupleElement(second_infeed)))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicatedReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce = f32[] reduce(constant, constant.1), dimensions={0,1}, + to_apply=sum, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::AllReduce(op::Reduce( + op::Select( + op::Compare(op::Add(op::Iota(), 
op::Broadcast(op::Reshape())), + op::Broadcast(op::Constant())), + AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant())), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[3,3]{1,0} constant({{2,2,2},{2,2,2},{2,2,2}}), + sharding={replicated} + multiply = f32[3,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1]0,1} + ROOT add = f32[3,3]{1,0} add(multiply, constant.1), + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f32[2,3]{1,0}"), + op::Add(op::Multiply( + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant()), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, TiledAllReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + parameter = f32[3,3]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT all-reduce = f32[3,3]{1,0} all-reduce(parameter), to_apply=sum, + replica_groups={}, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[2,3]{1,0}"), op::AllReduce(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[3,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,4,3]{2,1,0}"), + op::Broadcast(op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyOldDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, BroadcastBothOldAndNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} 
constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2,3]{2,1,0}"), + op::Broadcast(AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), + op::Constant()))))); +} + +TEST_F(SpmdPartitioningTest, BroadcastPropagateTiledSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,4,1},{1,3,1},{1,2,1}}), + sharding={devices=[2,1]0,1} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, OutfeedSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token.0 = token[] after-all() + data = f32[1024]{0} parameter(0), sharding={maximal device=0} + outfeed = token[] outfeed(data, token.0), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("token[]"), + op::Conditional( + op::Compare(op::PartitionId(), op::Constant()), + op::Tuple(op::Parameter(0), op::AfterAll()), + op::Tuple(op::Parameter(0), op::AfterAll())))); + + HloInstruction* root_b0 = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(root_b0, + AllOf(op::Shape("token[]"), + op::Outfeed(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1)))); + + HloInstruction* root_b1 = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(root_b1, AllOf(op::Shape("token[]"), op::AfterAll())); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowReplicatedInput) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={replicated} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[3,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_0x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow( + op::DynamicSlice(AllOf(op::Shape("f32[9,2]{1,0}"), + op::Pad(op::Constant(), op::Constant())), + op::Multiply(op::Reshape(), op::Constant()), + op::Constant()), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledNegativeLeftHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] 
parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce-window = f32[3,2]{1,0} reduce-window(%constant, %constant.1), + window={size=3x1 stride=2x1 pad=0_1x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = + op::Select(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideHaloBeyondNeighbor) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + param = f32[9,2] parameter(0), sharding={devices=[5,1]0,1,2,3,4} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(param, constant.1), + window={size=4x1 stride=2x1 pad=3_0x0_0}, to_apply=sum, + sharding={devices=[5,1]0,1,2,3,4} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/5)); + VLOG(1) << module->ToString(); + auto halo0 = AllOf(op::Shape("f32[1,2]"), + op::CollectivePermute(op::Slice(op::Parameter(0)))); + auto halo1 = + AllOf(op::Shape("f32[2,2]"), op::CollectivePermute(op::Parameter(0))); + auto pre_mask = + AllOf(op::Shape("f32[4,2]"), + op::Slice(AllOf(op::Shape("f32[5,2]"), + op::Concatenate(halo0, halo1, op::Parameter(0))))); + auto masked = + op::Select(op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply())), + op::Broadcast(op::Constant())), + pre_mask, op::Broadcast(op::Constant())); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[9,2]{1,0} constant( + {{1,1},{1,4},{2,1},{3,1},{1,2},{2,2},{4,1},{1,2},{2,1}}), + sharding={devices=[3,1]0,1,2} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_1x0_0}, to_apply=sum, + sharding={devices=[3,1]0,1,2} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/3)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), 
op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[7,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledTwoSideHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[4,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2]{1,0} reduce-window(constant, constant.1), + window={size=5x1 stride=3x1 pad=2_2x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto left_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto right_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = AllOf( + op::Shape("f32[5,2]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(left_halo, sharded_input, right_halo), + op::Constant())), + op::Reshape(), op::Constant())); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiled2D) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[4,4,2,2]{3,2,1,0}, token[]) infeed(token0), + sharding={{devices=[2,2,1,1]0,1,2,3}, {maximal device=0}} + infeed.data = f32[4,4,2,2]{3,2,1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,2,1,1]0,1,2,3} + constant = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2,2,2]{3,2,1,0} reduce-window(infeed.data, constant), + window={size=5x5x1x1 stride=3x3x1x1 pad=2_2x2_2x0_0x0_0}, to_apply=sum, + sharding={devices=[2,2,1,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = AllOf(op::Shape("f32[2,2,2,2]{3,2,1,0}"), + op::GetTupleElement(op::Infeed())); + auto dim0_left_halo = 
AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_right_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[6,2,2,2]{3,2,1,0}"), + op::Pad( + op::Concatenate(dim0_left_halo, sharded_input, dim0_right_halo), + op::Constant())), + op::Reshape(), op::Constant(), op::Constant(), op::Constant()); + auto dim0_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim0_masked = op::Select( + op::And(op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant()))), + dim0_pre_masking, op::Broadcast(op::Constant())); + auto dim0_resharded = AllOf(op::Shape("f32[5,2,2,2]{3,2,1,0}"), dim0_masked); + auto dim1_left_halo = AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_right_halo = + AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[5,6,2,2]{3,2,1,0}"), + op::Pad(op::Concatenate(dim1_left_halo, dim0_resharded, + dim1_right_halo), + op::Constant())), + op::Constant(), op::Reshape(), op::Constant(), op::Constant()); + auto dim1_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim1_masked = op::Select( + op::And(op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant()))), + dim1_pre_masking, op::Broadcast(op::Constant())); + auto dim1_resharded = AllOf(op::Shape("f32[5,5,2,2]{3,2,1,0}"), dim1_masked); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,1,2,2]{3,2,1,0}"), + op::ReduceWindow(dim1_resharded, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedNeedReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + 
%lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,224,224,3]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[64,2,112,224,3]")); + auto reshard_lhs = AllOf(op::Reshape(op::Transpose(all_to_all)), + op::Shape("f32[128,112,224,3]")); + + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, reshard_lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedReordered) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[224,224,3,128] parameter(0) + %lhs.copy = f32[224,224,3,128] copy(%lhs), sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=01fb_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[112,224,3,128]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[3,224,3,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[2,224,3,128]")); + EXPECT_THAT(root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +// (stride * per_shard_window_count) % dilation == 0 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationSameStartPatternLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,4,4,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 stride=4x4 pad=1_1x1_1 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + // There is no halo exchange, and because the last element in the shard is not + // needed (stride == 4), the LHS will be just a slice. + auto sliced_lhs = + AllOf(op::Slice(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant()))), + op::Shape("f32[128,3,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(sliced_lhs, rhs), + op::Shape("f32[128,2,4,512]"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 1); +} + +// (stride * per_shard_window_count) % dilation != 0 but stride == 1 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationStride1LhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,14,14,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[128,4,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,7,512]")); + auto start_window = op::Multiply(op::Reshape(), op::Constant()); + auto start_input_element = op::Divide(start_window, op::Constant()); + auto dynamic_offset_for_padded_concat = op::Subtract( + op::Constant(), op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + start_input_element)); + auto pre_masking = + AllOf(op::Shape("f32[128,5,7,512]"), + op::DynamicSlice( + AllOf(op::Shape("f32[128,6,7,512]"), + op::Pad(op::Concatenate(left_halo, lhs), op::Constant())), + op::Constant(), dynamic_offset_for_padded_concat, + op::Constant(), op::Constant())); + auto masked = op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(start_input_element)), + op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + auto dynamic_offset_on_output = op::Subtract( + start_window, op::Multiply(start_input_element, op::Constant())); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(AllOf(op::Convolution(masked, rhs), + op::Shape("f32[128,8,14,512]")), + op::Constant(), dynamic_offset_on_output, + op::Constant(), op::Constant()), + op::Shape("f32[128,7,14,512]"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT 
add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlapReshard) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[1,4]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto operand = AllOf(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Constant(), op::Reshape())), + op::Shape("f32[11,1]")); + auto reshard_operand = op::Reshape(op::Transpose( + op::AllToAll(op::Reshape(op::Pad(operand, op::Constant()))))); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + reshard_operand, op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterWithOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) 
+ d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[6,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8},{6,6},{1,9}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=2x2 pad=1_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto source_shard = + AllOf(op::Shape("f32[2,2]{1,0}"), + op::DynamicSlice(op::Pad(), op::Reshape(), op::Constant())); + // Max halo size is the same as the shard size, so slice is not needed. + auto source_left_halo = op::CollectivePermute(source_shard); + auto required_source_shard_start = + op::Divide(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto source_with_halo = op::DynamicSlice( + AllOf(op::Shape("f32[5,2]{1,0}"), + op::Pad(op::Concatenate(source_left_halo, source_shard), + op::Constant())), + op::Subtract(op::Constant(), + op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + required_source_shard_start)), + op::Constant()); + auto masked_source_with_halo = AllOf( + AllOf(op::Shape("f32[3,2]{1,0}")), + op::Select( + op::Compare( + op::Add(op::Iota(), op::Broadcast(required_source_shard_start)), + op::Broadcast(op::Constant())), + source_with_halo, op::Broadcast(op::Constant()))); + + auto data_shard = + AllOf(op::Shape("f32[3,4]{1,0}"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant()))); + auto data_left_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto data_right_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto required_data_start_on_padded = + op::Multiply(required_source_shard_start, op::Constant()); + auto left_halo_size = op::Subtract( + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()), + required_data_start_on_padded); + auto data_with_halo = + AllOf(op::Shape("f32[7,4]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[8,4]{1,0}"), + op::Pad(op::Concatenate(data_left_halo, data_shard, + data_right_halo), + op::Constant())), + op::Subtract(op::Constant(), left_halo_size), op::Constant())); + auto index_on_padded = + op::Add(op::Iota(), op::Broadcast(required_data_start_on_padded)); + auto masked_data_with_halo = op::Select( + op::And(op::Compare(index_on_padded, op::Broadcast(op::Constant())), + op::Compare(index_on_padded, op::Broadcast(op::Constant()))), + data_with_halo, op::Broadcast(op::Constant())); + + EXPECT_THAT( + root, AllOf(op::DynamicSlice(op::SelectAndScatter(masked_data_with_halo, + masked_source_with_halo, + op::Constant()), + left_halo_size, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + 
%rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowReversal) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[5,128,64] parameter(0), sharding={devices=[2,1,1]0,1} + %rhs = f32[5,128,256] parameter(1), sharding={devices=[2,1,1]1,0} + ROOT %conv = f32[1,64,256] convolution(%lhs, %rhs), + window={size=5 rhs_reversal=1}, dim_labels=0fb_0io->0bf, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto lhs_masked = + AllOf(op::Shape("f32[3,128,64]"), op::Select(_, op::Parameter(0), _)); + auto rhs_left_padded = op::Slice(op::Concatenate( + op::CollectivePermute(op::Slice(op::Parameter(1))), op::Parameter(1))); + auto rhs_masked = + AllOf(op::Shape("f32[3,128,256]"), op::Select(_, rhs_left_padded, _)); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(lhs_masked, rhs_masked)), + op::Shape("f32[1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, DotLhsTiledRhsTiledWithReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,56,56,256]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[2,64,28,56,64]")); + auto reshard = AllOf(op::Reshape(op::Transpose(all_to_all))); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(reshard, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,512] parameter(0) + %lhs.copy = f32[128,56,56,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,64] parameter(1) + %rhs.copy = 
f32[128,28,28,64] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,512,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, + dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,28,28,64]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(rhs)), op::Shape("f32[64,2,14,28,64]")); + auto reshard = op::Reshape(op::Transpose(all_to_all)); + + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), reshard)), + op::Shape("f32[1,1,512,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[32,16,28,64]")))), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + 
op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[128,60,112,64]")))), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilateUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,1,7,512]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice(op::Pad(lhs, op::Constant()), + op::Constant(), op::Subtract(), + op::Constant(), op::Constant()), + op::Shape("f32[128,10,14,512]")), + AllOf(op::Concatenate(left_halo, rhs), + op::Shape("f32[128,5,7,512]")))), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding_HaloOnLhs) { + const char* const 
hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[32,16,28,128]")), + rhs)), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilate_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[128,117,224,3]")), + rhs)), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << 
module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateUneven_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,14,512]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice( + AllOf(op::Pad(op::Concatenate(lhs, right_halo), + op::Constant()), + op::Shape("f32[128,10,14,512]")), + op::Constant(), op::Reshape(), op::Constant(), + op::Constant()), + op::Shape("f32[128,9,14,512]")), + rhs)), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[2,1]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[2,1]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,257]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,116]")); + EXPECT_THAT(root, + AllOf(op::Concatenate(param0, param1), op::Shape("f32[7,373]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[1,2]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[1,2]0,1} + 
ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Reshape())), + op::Shape("f32[14,129]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[14,58]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::DynamicUpdateSlice( + op::DynamicUpdateSlice( + op::Broadcast(), param0, + op::Constant(), op::Multiply()), + param1, op::Constant(), op::Add())), + op::Shape("f32[14,374]")), + op::Constant(), op::Multiply()), + op::Shape("f32[14,187]"))); +} + +TEST_F(SpmdPartitioningTest, PadAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[128,17,257] pad(%param0, %const), padding=0_0x1_2x0_0, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Pad(param0, op::Constant()), + op::Shape("f32[128,17,129]"))); +} + +TEST_F(SpmdPartitioningTest, PadAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0), sharding={devices=[1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[14,259] pad(%param0, %const), padding=0_0x0_2, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[14,129]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[14,130]"), + op::Concatenate(param0, op::CollectivePermute(op::Slice(param0)))); + auto pad = AllOf(op::Shape("f32[14,131]"), + op::Pad(after_halo_exchange, op::Constant())); + EXPECT_THAT(root, op::DynamicSlice(pad, op::Constant(), _)); +} + +TEST_F(SpmdPartitioningTest, PadAlongPartitionedDimensionWithInteriorPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[7] parameter(0), sharding={devices=[2]0,1} + %param1 = f32[] parameter(1), sharding={replicated} + ROOT %pad = f32[22] pad(%param0, %param1), padding=2_1_2, + sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto param0 = AllOf(op::Parameter(), op::Shape("f32[4]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[4]"), + op::DynamicSlice( + AllOf(op::Shape("f32[5]"), + op::Concatenate(op::CollectivePermute(op::Slice(param0)), + param0)), + _)); + auto pad = op::Pad(after_halo_exchange, op::Parameter(1)); + EXPECT_THAT(root, op::DynamicSlice(pad, _)); +} + +TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module 
+ +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[128,11,257] slice(%param0.copy), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[63,14,251] slice(%param0.copy), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), op::Add()), + op::Shape("f32[128,14,126]"))), + op::Shape("f32[63,14,126]"))); +} + +TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ge { + p.0.lhs.1247 = f32[]{:T(256)} parameter(0), sharding={replicated} + bitcast-convert = s32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + constant = s32[]{:T(256)} constant(0), sharding={replicated} + compare = pred[]{:T(256)E(32)} compare(bitcast-convert, constant), direction=LT, sharding={replicated} + constant.1 = u32[]{:T(256)} constant(2147483647), sharding={replicated} + bitcast-convert.1 = u32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + subtract = u32[]{:T(256)} subtract(constant.1, bitcast-convert.1), sharding={replicated} + bitcast-convert.2 = s32[]{:T(256)} bitcast-convert(subtract), sharding={replicated} + select = s32[]{:T(256)} select(compare, bitcast-convert.2, bitcast-convert), sharding={replicated} + p.0.rhs.1248 = f32[]{:T(256)} parameter(1), sharding={replicated} + bitcast-convert.3 = s32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + compare.1 = pred[]{:T(256)E(32)} compare(bitcast-convert.3, constant), direction=LT, sharding={replicated} + bitcast-convert.4 = u32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + subtract.1 = u32[]{:T(256)} subtract(constant.1, bitcast-convert.4), sharding={replicated} + bitcast-convert.5 = s32[]{:T(256)} bitcast-convert(subtract.1), sharding={replicated} + select.1 = s32[]{:T(256)} select(compare.1, bitcast-convert.5, bitcast-convert.3), sharding={replicated} + compare.2 = pred[]{:T(256)E(32)} compare(select, select.1), direction=GT, sharding={replicated} + compare.258 = pred[]{:T(256)E(32)} compare(select.1, select), direction=GT, sharding={replicated} + 
compare.259 = pred[]{:T(256)E(32)} compare(compare.2, compare.258), direction=EQ, sharding={replicated} + p.1.lhs.1249 = s32[]{:T(256)} parameter(2), sharding={replicated} + p.1.rhs.1250 = s32[]{:T(256)} parameter(3), sharding={replicated} + compare.260 = pred[]{:T(256)E(32)} compare(p.1.lhs.1249, p.1.rhs.1250), direction=LT, sharding={replicated} + ROOT select.86 = pred[]{:T(256)E(32)} select(compare.259, compare.260, compare.2), sharding={replicated} +} + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,2,1]0,1} + %param1 = s32[128,14,257] parameter(1) + %param1.copy = s32[128,14,257] copy(%param1), sharding={devices=[1,2,1]0,1} + ROOT %sort.6 = (f32[128,14,257]{2,1,0:T(8,128)}, s32[128,14,257]{2,1,0:T(8,128)}) + sort(%param0.copy, %param1.copy), dimensions={2}, is_stable=true, + to_apply=%ge, sharding={{devices=[1,2,1]0,1},{devices=[1,2,1]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,7,257]")); + auto param1 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("s32[128,7,257]")); + EXPECT_THAT(root, AllOf(op::Sort(param0, param1), + op::Shape("(f32[128,7,257], s32[128,7,257])"))); +} + +TEST_F(SpmdPartitioningTest, PartitionCustomCall) { + const char* const hlo_string = R"( +HloModule cluster_2013453984438090939__.47 + +ENTRY %cluster_2013453984438090939__.47 + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %custom-call = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + custom-call(bf16[2,209664]{1,0} %copy.arg_tuple.1), custom_call_target="TopK" + %get-tuple-element = bf16[2,2000]{1,0} + get-tuple-element((bf16[2,2000]{1,0}, s32[2,2000]{1,0}) %custom-call), + index=0, sharding={replicated} + %get-tuple-element.1 = s32[2,2000]{1,0} get-tuple-element((bf16[2,2000]{1,0}, + s32[2,2000]{1,0}) %custom-call), index=1, sharding={replicated} + ROOT %tuple.46 = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + tuple(bf16[2,2000]{1,0} %get-tuple-element, s32[2,2000]{1,0} + %get-tuple-element.1), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto custom_call = FindInstruction(module.get(), "custom-call.1"); + EXPECT_EQ(custom_call->operand(0)->shape().dimensions(1), 104832); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, PartitionSortInTopK) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.9: bf16[], p.0.rhs.10: bf16[], p.1.lhs.11: + s32[], p.1.rhs.12: s32[]) -> pred[] { + %p.1.lhs.11 = s32[] parameter(2) + %p.1.rhs.12 = s32[] parameter(3) + %p.0.lhs.9 = bf16[] parameter(0) + %convert.13 = f32[] convert(bf16[] %p.0.lhs.9) + %bitcast-convert.16 = s32[] bitcast-convert(f32[] %convert.13) + %constant.20 = s32[] constant(0) + %compare.21 = pred[] compare(s32[] %bitcast-convert.16, s32[] 
%constant.20), + direction=LT + %constant.15 = u32[] constant(2147483647) + %bitcast-convert.17 = u32[] bitcast-convert(f32[] %convert.13) + %subtract.18 = u32[] subtract(u32[] %constant.15, u32[] %bitcast-convert.17) + %bitcast-convert.19 = s32[] bitcast-convert(u32[] %subtract.18) + %select.22 = s32[] select(pred[] %compare.21, s32[] %bitcast-convert.19, s32[] + %bitcast-convert.16) + %p.0.rhs.10 = bf16[] parameter(1) + %convert.14 = f32[] convert(bf16[] %p.0.rhs.10) + %bitcast-convert.24 = s32[] bitcast-convert(f32[] %convert.14) + %constant.28 = s32[] constant(0) + %compare.29 = pred[] compare(s32[] %bitcast-convert.24, s32[] %constant.28), + direction=LT + %constant.23 = u32[] constant(2147483647) + %bitcast-convert.25 = u32[] bitcast-convert(f32[] %convert.14) + %subtract.26 = u32[] subtract(u32[] %constant.23, u32[] %bitcast-convert.25) + %bitcast-convert.27 = s32[] bitcast-convert(u32[] %subtract.26) + %select.30 = s32[] select(pred[] %compare.29, s32[] %bitcast-convert.27, s32[] + %bitcast-convert.24) + ROOT %compare.31 = pred[] compare(s32[] %select.22, s32[] %select.30), + direction=GT +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 104832); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 104832); + auto final_sort = FindInstruction(module.get(), "sort.1"); + EXPECT_EQ(final_sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(final_sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, PartitionSortInTopKWhenComparisonWithSelect) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = 
u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 104832); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 104832); + auto final_sort = FindInstruction(module.get(), "sort.1"); + EXPECT_EQ(final_sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(final_sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenSecondOperandIsNotIota) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] 
%p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %arg_tuple.2 = s32[2,209664] parameter(1) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %arg_tuple.2), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + std::cout << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenNoPartitionInSortDim) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + 
%bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[2,1]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + std::cout << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenSliceInOtherDim) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = 
bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[1,209664] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:1], [0:209664]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[1,209664] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:1], [0:209664]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[1,209664], s32[1,209664]) + tuple(bf16[1,209664] %slice.34, s32[1,209664] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + +TEST_F(SpmdPartitioningTest, ShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), 
sharding={devices=[1,2,1,1]0,1}
+  ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy),
+    dimensions={0,3,1,2}, sharding={devices=[1,1,2,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 = AllOf(
+      op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[16,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[16,4,19,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, MultiDimensionShardedTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0),
+    sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7}
+  ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy),
+    dimensions={1,3,0,2}, sharding={devices=[2,1,4,1]0,2,4,6,1,3,5,7}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 = AllOf(
+      op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[4,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,4,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1}
+  ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy),
+    dimensions={0,3,1,2}, sharding={devices=[1,2,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto reshard = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))),
+                       op::Shape("f32[16,38,38,2]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, ShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[2,1,1]0,1}
+  ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[2,1,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 =
+      AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(),
+                                      op::Constant(), op::Constant())),
+            op::Shape("f32[19,38,324]"));
+  EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[1,1,2]0,1}
+  ROOT %transpose = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[1,1,1,2]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      
AllOf(op::DynamicSlice( + AllOf(op::Pad( + AllOf(op::Reshape(AllOf(op::AllReduce(), + op::Shape("f32[38,38,324]"))), + op::Shape("f32[38,38,4,81]")), + op::Constant()), + op::Shape("f32[38,38,4,82]")), + op::Constant(), op::Constant(), op::Constant(), op::Reshape()), + op::Shape("f32[38,38,4,41]"))); +} + +TEST_F(SpmdPartitioningTest, ReshapeMergeDimsWithHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[2,3,7,10] parameter(0), sharding={devices=[1,1,2,1]0,1} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + +// Produces an invalid module after transformation. +TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[128,5,5,768] parameter(0) + %param0.copy = f32[128,5,5,768] copy(%param0), + sharding={devices=[1,4,1,1]0,1,2,3} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %rw = f32[128,17,17,768] reduce-window(%param0.copy, %constant.1), + window={size=1x5x5x1 pad=0_0x4_4x4_4x0_0 lhs_dilate=1x3x3x1}, + to_apply=sum, sharding={devices=[1,4,1,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto input_shard = op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(0), op::Constant()), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())); + auto id_mul4_add1 = + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto id_mul5 = op::Multiply(op::Reshape(), op::Constant()); + auto id_mul5_add1_div3 = + op::Divide(op::Add(id_mul5, op::Constant()), op::Constant()); + auto before_masking = AllOf( + op::Shape("f32[128,3,5,768]"), + op::DynamicSlice( + AllOf( + op::Shape("f32[128,4,5,768]"), + op::Concatenate(op::CollectivePermute(input_shard), input_shard)), + op::Constant(), + op::Subtract(op::Constant(), + op::Subtract(id_mul4_add1, id_mul5_add1_div3)), + op::Constant(), op::Constant())); + auto masked = op::Select( + op::And(op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant())), + op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant()))), + before_masking, op::Broadcast(op::Constant())); + auto rw = AllOf(op::Shape("f32[128,7,17,768]"), + op::ReduceWindow(masked, op::Constant())); + auto final_slice_index = op::Subtract( + id_mul5, + op::Add(op::Multiply(id_mul5_add1_div3, op::Constant()), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[128,5,17,768]"), + op::DynamicSlice(rw, op::Constant(), final_slice_index, + op::Constant(), op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + 
ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,1,1,2]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[4,32,32,64]")); + + EXPECT_THAT(root, + AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,1]0,1} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2]0,1}, {devices=[2]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Reduce(op::Parameter(0), op::Parameter(1), + op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,2,1,1]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[4,16,32,128]")); + + EXPECT_THAT(root, + AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Reduce(param0, op::Constant())), + op::Shape("f32[128]")), + op::Reshape()), + op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongNonTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=1, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = 
module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Iota(), op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, U32IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = u32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("u32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, Conditional) { + const char* const hlo_string = R"( +HloModule module + +Negate { + x = f32[4,5] parameter(0), sharding={replicated} + ROOT negate = f32[4,5] negate(x), sharding={replicated} +} + +Identity { + y = f32[4,5] parameter(0), sharding={devices=[2,1]0,1} + ROOT copy = f32[4,5] copy(y), sharding={devices=[2,1]0,1} +} + +ENTRY entry { + %param.0 = pred[] parameter(0) + %param.0.copy = pred[] copy(%param.0), sharding={maximal device=0} + %param.1 = f32[4,5] parameter(1) + %param.1.copy = f32[4,5] copy(%param.1), sharding={replicated} + %param.2 = f32[4,5] parameter(2) + %param.2.copy = f32[4,5] copy(%param.2), sharding={devices=[2,1]0,1} + ROOT cond = f32[4,5] conditional(%param.0.copy, %param.1.copy, %param.2.copy), + true_computation=Negate, false_computation=Identity, + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto param0 = AllOf(op::Copy(op::Copy(op::Parameter()), op::Shape("pred[]"))); + auto param1 = AllOf(op::Copy(op::Parameter()), op::Shape("f32[4,5]")); + auto param2 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[2,5]")); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Conditional(op::AllReduce(), param1, param2), + op::Shape("f32[2,5]"))); + + auto then_branch_root = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(then_branch_root, + AllOf(op::DynamicSlice(op::Negate(op::Parameter()), op::Reshape(), + op::Constant()), + op::Shape("f32[2,5]"))); + + auto else_branch_root = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(else_branch_root, + AllOf(op::Copy(op::Parameter()), op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatter_RetinaNet) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param.0 = f32[32,128,384,64] parameter(0) + %param.0.copy = f32[32,128,384,64] copy(%param.0), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + %param.1 = f32[32,64,192,64] parameter(1) + %param.1.copy = f32[32,64,192,64] 
copy(%param.1), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[32,128,384,64] select-and-scatter(param.0.copy, + %param.1.copy, constant.1), window={size=1x1x1x1 stride=1x2x2x1}, + select=ge, scatter=sum, sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto source = AllOf( + op::Shape("f32[32,8,192,64]"), + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + auto data = AllOf( + op::Shape("f32[32,16,384,64]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + + EXPECT_THAT(root, op::SelectAndScatter(data, source, op::Constant())); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, TiledDot) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]"))); +} + +TEST_F(SpmdPartitioningTest, TiledDotOutputTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]")), + op::Constant(), op::Reshape()), + op::Shape("f32[128,128]"))); +} + +TEST_F(SpmdPartitioningTest, BatchPartitionedConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,256,256] parameter(0) + %lhs.copy = f32[128,256,256] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[256,8,1] parameter(1) + %rhs.copy = f32[256,8,1] copy(%rhs), sharding={replicated} + ROOT %conv = 
f32[128,256,8] convolution(%lhs.copy, %rhs.copy), + window={size=1}, dim_labels=0bf_io0->0bf, sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,128,256]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[256,8,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[128,128,8]"))); +} + +TEST_F(SpmdPartitioningTest, DotOutputFeaturePartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,64] parameter(0) + %lhs.copy = f32[24,64] copy(%lhs), sharding={replicated} + %rhs = f32[39296,64] parameter(1) + %rhs.copy = f32[39296,64] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %dot = f32[24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant())), + op::Shape("f32[19648,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), 
+ op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, op::DynamicSlice(rhs, op::Reshape(), + op::Constant(), + op::Constant())), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,12,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + auto lhs_reshard = op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs)))); + EXPECT_THAT(root, + AllOf(op::Dot(lhs_reshard, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(0)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,24,64]")); + auto rhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(1)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs_slice, rhs_slice), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,1,2,2]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + 
op::Shape("f32[32,24,32,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,39296,32,64]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Dot(lhs, rhs)), + op::Shape("f32[32,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,128,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,12,64,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,12,64,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + ROOT %dot = f32[32,24,39296,128] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,19648,64,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,24,19648,64]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputLHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Dot(AllOf(op::DynamicSlice(lhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + 
op::Shape("f32[32,12,64,128]")), + rhs), + op::Shape("f32[32,12,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputRHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT(root, + AllOf(op::Dot(lhs, AllOf(op::DynamicSlice( + rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,19648,64,128]"))), + op::Shape("f32[32,24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,64,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,19648,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))), + op::Shape("f32[32,12,39295]"))); + auto while_loop = root->operand(0)->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot(op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::DynamicUpdateSlice(op::GetTupleElement(op::Parameter(0)), + partial_output, op::Constant(), + op::Constant(), op::Reshape()), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,63,128] parameter(0) + %lhs.copy = f32[32,24,63,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,63,128] parameter(1) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,63,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,39296,32,128]")); + auto masked_rhs = + op::Select(op::Compare(), rhs, op::Broadcast(op::Constant())); + EXPECT_THAT(root, + AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, masked_rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))); + auto while_loop = root->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot( + op::DynamicSlice( + op::Pad(op::GetTupleElement(op::Parameter(0)), op::Constant()), + op::Constant(), op::Constant(), op::Reshape(), op::Constant()), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::Add(op::GetTupleElement(op::Parameter(0)), partial_output), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce1) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,24] reduce(%multiply, %constant), dimensions={2}, + to_apply=sum, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce2) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,39295] reduce(%multiply, %constant), dimensions={1}, + to_apply=sum, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. 
+} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContractingFromBroadcast) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %rhs = f32[32,39296,63,128] parameter(0) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,63,128] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1,1]0,1} + %add = f32[32,24,63,128] add(%broadcast, %broadcast), + sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%add, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, ReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={replicated} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("s32[]")); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Rng(), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={maximal device=1} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Copy(op::Parameter(1))), op::Shape("s32[]")); + EXPECT_THAT(root, AllOf(op::Rng(lhs, op::AllReduce(op::Select( + op::Broadcast(op::Compare()), rhs, + op::Broadcast(op::Constant())))), + op::Shape("s32[2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + ROOT %dynamic-slice = s32[128,2] dynamic-slice(%input.copy, %constant, %index), + dynamic_slice_sizes={128,2}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(input, 
op::Constant(), op::Parameter(1)), + op::Shape("s32[64,2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicUpdateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + %update = s32[128,2] parameter(2) + %update.copy = s32[128,2] copy(%update), sharding={devices=[2,1]0,1} + ROOT %dynamic-update-slice = s32[128,64] + dynamic-update-slice(%input.copy, %update.copy, %constant, %index), + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + auto update = AllOf(op::Copy(op::DynamicSlice(op::Parameter(2), op::Reshape(), + op::Constant())), + op::Shape("s32[64,2]")); + EXPECT_THAT(root, AllOf(op::DynamicUpdateSlice(input, update, op::Constant(), + op::Parameter(1)), + op::Shape("s32[64,64]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughGather) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + +TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] 
parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={devices=[1,2]0,1} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + +TEST_F(SpmdPartitioningTest, TiledReversePassthrough) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT reverse = f32[3,3]{1,0} reverse(constant), dimensions={1}, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]{1,0}"), + op::Reverse(op::DynamicSlice( + op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, TiledReversePassthroughViaReversedSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[4] reverse(param), dimensions={0}, + sharding={devices=[2]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2]"), op::Reverse(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, TiledReverseSwapShards) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[4] reverse(param), dimensions={0}, + sharding={devices=[2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, 
/*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[2]"), + op::Reverse(op::CollectivePermute(op::Parameter(0))))); +} + +TEST_F(SpmdPartitioningTest, TiledReverseHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[3] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[3] reverse(param), dimensions={0}, + sharding={devices=[2]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto halo_exchange_concat = + op::Concatenate(AllOf(op::Shape("f32[1]"), + op::CollectivePermute(op::Slice(op::Parameter(0)))), + op::Parameter(0)); + auto after_halo_exchange = op::Slice(halo_exchange_concat); + EXPECT_THAT(root, + AllOf(op::Shape("f32[2]"), op::Reverse(after_halo_exchange))); +} + +TEST_F(SpmdPartitioningTest, MixWithManualPartitioning) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[8,2] parameter(0), sharding={devices=[2,1]0,1} + to_shard = f32[4,2] custom-call(param), custom_call_target="SPMDFullToShardShape", sharding={replicated} + add = f32[4,2] add(to_shard, to_shard), sharding={replicated} + to_full = f32[8,2] custom-call(add), custom_call_target="SPMDShardToFullShape", sharding={devices=[2,1]0,1} + ROOT mul = f32[8,2] multiply(to_full, param), sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto to_shard = op::Copy(op::Parameter(0)); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2]"), + op::Multiply(op::Copy(op::Add(to_shard, to_shard)), + op::Parameter(0)))); +} + +} // namespace +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc new file mode 100644 index 00000000000..df7597628af --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -0,0 +1,874 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" + +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace spmd { + +bool HasReplicatedSharding(const HloSharding& sharding) { + if (sharding.IsTuple()) { + return absl::c_any_of(sharding.tuple_elements(), HasReplicatedSharding); + } + return sharding.IsReplicated(); +} + +HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back( + CreateZero(ShapeUtil::GetTupleElementShape(shape, i), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + if (shape.IsToken()) { + return b->AddInstruction(HloInstruction::CreateToken()); + } + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, zero, {})); +} + +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + if (!EvenlyPartitions(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))) { + return false; + } + } + } + + if (sharding.IsTileMaximal()) { + return sharding.IsReplicated(); + } + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + if (shape.dimensions(i) % sharding.tile_assignment().dim(i) != 0) { + return false; + } + } + return true; +} + +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back( + MakePartitionedShape(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + return sharding.TileShape(shape); +} + +int64 ShapeSizeInBytes(const Shape& shape) { + return 
ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type()) * + ShapeUtil::ElementsIn(shape); +} + +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back(MakeNonPaddedShapeForGivenPartition( + ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}), partition_id)); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + + auto partition_shape = shape; + std::vector tile_offset = + sharding.TileOffsetForDevice(shape, partition_id); + std::vector tile_limit = + sharding.TileLimitForDevice(shape, partition_id); + for (int64 i = 0; i < tile_offset.size(); ++i) { + if (sharding.UsesDevice(partition_id)) { + partition_shape.set_dimensions(i, tile_limit[i] - tile_offset[i]); + } else { + partition_shape.set_dimensions(i, 0); + } + } + return partition_shape; +} + +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b) { + CHECK(!shape.IsTuple()); + + Array2D offset_array( + {sharding.tile_assignment().num_elements(), shape.rank()}); + offset_array.Each([&](int64 i, int64 j, int32* value) { + *value = sharding.TileOffsetForDevice(shape, i)[j]; + }); + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector offsets; + for (int64 i = 0; i < shape.rank(); ++i) { + if (sharding.tile_assignment().dim(i) == 1) { + offsets.push_back(b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); + } else { + auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1, 1}), offset_table, + {partition_id, b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(i)))}, + {1, 1})); + offsets.push_back(b->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); + } + } + return offsets; +} + +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { + CHECK(!sharding.IsTileMaximal()); + auto table_shape = + ShapeUtil::MakeShape(S32, sharding.tile_assignment().dimensions()); + return MakePartitionOffsets(table_shape, sharding, partition_id, b); +} + +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, HloComputation* computation) { + CHECK(b == nullptr || computation == nullptr); + if (ShapeUtil::Compatible(hlo->shape(), padded_shape)) { + return hlo; + } + PaddingConfig padding_config; + for (int64 i = 0; i < padded_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + hlo->shape().dimensions(i)); + } + auto add_hlo = [&](std::unique_ptr to_add) { + if (b == nullptr) { + return computation->AddInstruction(std::move(to_add)); + } + return b->AddInstruction(std::move(to_add)); + }; + auto zero = add_hlo(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + return add_hlo( + HloInstruction::CreatePad(padded_shape, hlo, zero, padding_config)); +} + +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return base_shape; + } + 
if (EvenlyPartitions(base_shape, sharding)) { + return base_shape; + } + auto shard_shape = MakePartitionedShape(base_shape, sharding); + Shape padded_base_shape = base_shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shard_shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + return padded_base_shape; +} + +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b) { + auto padded_base_shape = + GetPaddedShapeForUnevenPartitioning(hlo->shape(), sharding); + if (ShapeUtil::Compatible(padded_base_shape, hlo->shape())) { + return hlo; + } + return PadToShape(hlo, padded_base_shape, b); +} + +absl::optional UniqueTiledDim(const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return absl::nullopt; + } + int64 dim = -1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (sharding.tile_assignment().dim(i) > 1) { + if (dim != -1) { + return absl::nullopt; + } + dim = i; + } + } + CHECK_NE(dim, -1); + return dim; +} + +MultiplyAddDivideOffsetCalculation::MultiplyAddDivideOffsetCalculation( + int64 multiplier, int64 offset, int64 divisor) + : multiplier_(multiplier), offset_(offset), divisor_(divisor) { + CHECK_GT(divisor_, 0); + Simplify(); +} + +OffsetCalculation MultiplyAddDivideOffsetCalculation::operator-( + const MultiplyAddDivideOffsetCalculation& other) const { + if (divisor_ == 1 && other.divisor_ == 1) { + return OffsetCalculation(MultiplyAddDivideOffsetCalculation( + multiplier_ - other.multiplier_, offset_ - other.offset_, 1)); + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +void MultiplyAddDivideOffsetCalculation::Simplify() { + // We could simplify the calculation when multiplier is a multiple of + // divisor_. However, when offset_ is not a multiple of divisor_, we must + // make sure that offset_ and multiplier_ are both non-negative or both + // non-positive. E.g., (3 * i - 1) / 3 is not equivalent to i or i - 1. 
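  // For illustration (worked example using the constructor above):
  //   MultiplyAddDivideOffsetCalculation calc(/*multiplier=*/3, /*offset=*/-1,
  //                                           /*divisor=*/3);
  //   calc.Calculate(0) == 0   // equals i
  //   calc.Calculate(2) == 1   // equals i - 1
  // With truncating integer division no single simplified form matches every
  // shard ordinal, so the fold below is skipped when the sign condition fails.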
+ if (divisor_ != 1 && multiplier_ % divisor_ == 0 && + (offset_ % divisor_ == 0 || offset_ * multiplier_ > 0)) { + multiplier_ /= divisor_; + offset_ /= divisor_; + divisor_ = 1; + } +} + +int64 MultiplyAddDivideOffsetCalculation::Calculate(int64 shard_ordinal) const { + return (shard_ordinal * multiplier_ + offset_) / divisor_; +} + +HloInstruction* MultiplyAddDivideOffsetCalculation::Calculate( + HloInstruction* shard_ordinal, SpmdBuilder* b) const { + auto scalar_shape = ShapeUtil::MakeShape(S32, {}); + if (multiplier_ == 0) { + return b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(offset_ / divisor_))); + } + HloInstruction* result = shard_ordinal; + if (multiplier_ != 1) { + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kMultiply, shard_ordinal, + b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(multiplier_))))); + } + if (offset_ != 0) { + auto offset = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(offset_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, result, offset)); + } + if (divisor_ != 1) { + auto divisor = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(divisor_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kDivide, result, divisor)); + } + return result; +} + +int64 MultiplyAddDivideOffsetCalculation::MaxInRange( + int64 start_ordinal, int64 limit_ordinal) const { + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +OffsetCalculation& OffsetCalculation::operator=( + const OffsetCalculation& other) { + opcode_ = other.opcode_; + copy_from_ = other.copy_from_; + if (opcode_ != HloOpcode::kCopy) { + lhs_ = absl::make_unique(*other.lhs_); + rhs_ = absl::make_unique(*other.rhs_); + } + return *this; +} + +bool OffsetCalculation::IsConstant() const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.IsConstant(); + } + if (opcode_ == HloOpcode::kSubtract && *lhs_ == *rhs_) { + return true; + } + return lhs_->IsConstant() && rhs_->IsConstant(); +} + +OffsetCalculation OffsetCalculation::operator-( + const OffsetCalculation& other) const { + if (opcode_ == HloOpcode::kCopy && other.opcode_ == HloOpcode::kCopy) { + return copy_from_ - other.copy_from_; + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +bool OffsetCalculation::operator==(const OffsetCalculation& other) const { + if (opcode_ != other.opcode_) { + return false; + } + if (opcode_ == HloOpcode::kCopy) { + return copy_from_ == other.copy_from_; + } + return *lhs_ == *other.lhs_ && *rhs_ == *other.rhs_; +} + +int64 OffsetCalculation::Calculate(int64 shard_ordinal) const { + switch (opcode_) { + case HloOpcode::kCopy: + return copy_from_.Calculate(shard_ordinal); + case HloOpcode::kSubtract: + return lhs_->Calculate(shard_ordinal) - rhs_->Calculate(shard_ordinal); + case HloOpcode::kMultiply: + return lhs_->Calculate(shard_ordinal) * rhs_->Calculate(shard_ordinal); + default: + LOG(FATAL) << "Should not happen"; + } +} + +HloInstruction* OffsetCalculation::Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.Calculate(shard_ordinal, b); + } + auto lhs = lhs_->Calculate(shard_ordinal, b); + auto rhs = rhs_->Calculate(shard_ordinal, b); + return b->AddInstruction( + 
HloInstruction::CreateBinary(lhs->shape(), opcode_, lhs, rhs)); +} + +int64 OffsetCalculation::MaxInRange(int64 start_ordinal, + int64 limit_ordinal) const { + if (IsConstant()) { + return Calculate(start_ordinal); + } + if (opcode_ == HloOpcode::kCopy) { + return std::max(Calculate(start_ordinal), Calculate(limit_ordinal - 1)); + } + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, int64 dim, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + int64 input_shard_size = hlo->shape().dimensions(dim); + int64 shard_count = target.tile_assignment().dim(dim); + + std::vector concat_pieces; + + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + for (int64 i = CeilOfRatio(max_left_halo_size, input_shard_size) - 1; i >= 0; + --i) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > i) { + std::vector source_indices(indices.begin(), indices.end()); + source_indices[dim] -= i + 1; + source_target_pairs.emplace_back( + target.tile_assignment()(source_indices), device); + } + }); + int64 halo_size = + std::min(max_left_halo_size - input_shard_size * i, input_shard_size); + auto halo_shape = hlo->shape(); + auto source_halo_slice = hlo; + if (halo_size != hlo->shape().dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + halo_start_indices[dim] = hlo->shape().dimensions(dim) - halo_size; + std::vector halo_slice_strides(halo_shape.rank(), 1); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, hlo->shape().dimensions(), + halo_slice_strides)); + } + auto left_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(left_halo); + } + + concat_pieces.push_back(hlo); + + // Right halo. 
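  // For illustration (assuming a tile assignment of {0, 1}): with two shards
  // of size 4 along `dim` and a right-halo size of 1, the loop below runs once
  // (i = 0), builds the source-target pair {1 -> 0}, and the collective-permute
  // sends the first element of partition 1's shard to partition 0, where it is
  // concatenated after the local data.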
+ int64 max_right_halo_size = + right_halo_size_function.MaxInRange(0, shard_count - 1); + for (int64 i = 0; i < CeilOfRatio(max_right_halo_size, input_shard_size); + ++i) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > i) { + std::vector target_indices(indices.begin(), indices.end()); + target_indices[dim] -= i + 1; + source_target_pairs.emplace_back( + device, target.tile_assignment()(target_indices)); + } + }); + int64 halo_size = + std::min(max_right_halo_size - input_shard_size * i, input_shard_size); + auto halo_shape = hlo->shape(); + HloInstruction* source_halo_slice = hlo; + if (halo_size != halo_shape.dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + std::vector halo_slice_strides(halo_shape.rank(), 1); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, halo_shape.dimensions(), + halo_slice_strides)); + } + auto right_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(right_halo); + } + + auto concat = hlo; + // Concat with halos/padding. + if (concat_pieces.size() > 1) { + auto concat_shape = hlo->shape(); + int64 concat_dim_size = 0; + for (auto piece : concat_pieces) { + concat_dim_size += piece->shape().dimensions(dim); + } + concat_shape.set_dimensions(dim, concat_dim_size); + concat = b->AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, concat_pieces, dim)); + } + + return concat; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + CHECK(left_halo_size_functions.size() == hlo->shape().rank()); + CHECK(right_halo_size_functions.size() == hlo->shape().rank()); + + HloInstruction* visiting_hlo = hlo; + for (int dim = 0; dim < hlo->shape().rank(); ++dim) { + auto concat = ExchangeHalo(visiting_hlo, left_halo_size_functions[dim], + right_halo_size_functions[dim], dim, target, + collective_ops_creator, next_channel_id, b); + if (!concat) { + return absl::nullopt; + } + visiting_hlo = *concat; + } + return visiting_hlo; +} + +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region) { + auto halo_exchange_result = + ExchangeHalo(hlo, left_halo_size_function, right_halo_size_function, dim, + target, collective_ops_creator, next_channel_id, b); + if (!halo_exchange_result) { + return absl::nullopt; + } + auto concat = *halo_exchange_result; + int64 shard_count = target.tile_assignment().dim(dim); + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + + // Now we determine if we need extra padding after the concat. + // + // The max of halo size or the first shard's explicit left padding. 
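  // For illustration: if the largest left halo across shards is 2 and the
  // explicit left padding on the full shape is 3, then
  // max_left_halo_or_padding_size below is 3, and a shard whose own left halo
  // is 2 starts its dynamic-slice into the padded concat at offset 3 - 2 = 1.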
+ int64 max_left_halo_or_padding_size = + std::max(std::max(int64{0}, max_left_halo_size), + explicit_left_padding_on_full_shape); + // The calculation that returns the dynamic slice index for a shard on the + // padded concat, which is the difference between + // max_left_halo_or_padding_size and its left halo size. + auto start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, max_left_halo_or_padding_size, 1)) - + left_halo_size_function; + + // See if we need to pad the concat before dynamic slice. + int64 extra_left_padding = + std::max(int64{0}, max_left_halo_or_padding_size - + std::max(int64{0}, max_left_halo_size)); + int64 extra_right_padding = + start_offset_on_padded_concat_calculation.MaxInRange(0, shard_count) + + shard_size_with_halo - concat->shape().dimensions(dim) - + extra_left_padding; + extra_right_padding = std::max(int64{0}, extra_right_padding); + if (extra_left_padding > 0 || extra_right_padding > 0) { + PaddingConfig padding_config; + auto padded_concat_shape = concat->shape(); + for (int64 i = 0; i < base_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + if (i != dim) { + continue; + } + padding_config_dim->set_edge_padding_low(extra_left_padding); + padding_config_dim->set_edge_padding_high(extra_right_padding); + padded_concat_shape.set_dimensions(dim, concat->shape().dimensions(dim) + + extra_left_padding + + extra_right_padding); + } + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, pad_value, padding_config)); + } + + auto valid_slice = concat; + if (shard_size_with_halo != concat->shape().dimensions(dim)) { + // Concat is bigger than the shard shape, so we need a dynamic slice. + CHECK_LT(shard_size_with_halo, concat->shape().dimensions(dim)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, shard_size_with_halo); + + if (left_halo_size_function.IsConstant() && + left_halo_size_function.Calculate(0) == + explicit_left_padding_on_full_shape) { + std::vector start_indices(slice_shape.rank(), 0); + std::vector strides(slice_shape.rank(), 1); + valid_slice = b->AddInstruction( + HloInstruction::CreateSlice(slice_shape, concat, start_indices, + slice_shape.dimensions(), strides)); + } else { + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(base_shape.rank(), zero); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinal, b); + valid_slice = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + } + + if (!mask_invalid_region) { + return valid_slice; + } + + int64 total_right_padding = padded_full_shape_size - + base_shape.dimensions(dim) - + explicit_left_padding_on_full_shape; + // Mask off garbage data due to uneven partition or low/high padding. 
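  // For illustration: a base dimension of size 5 tiled across two shards is
  // padded to 6, so (with no explicit left padding) the last element of
  // partition 1 sits at index 5 of the padded full shape; 5 is not less than
  // the valid limit of 5, so the compare below masks that element and
  // pad_value is selected instead.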
+ if (explicit_left_padding_on_full_shape > 0 || total_right_padding > 0) { + auto index_shape = ShapeUtil::ChangeElementType(valid_slice->shape(), S32); + auto iota = b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index_in_padded_shape = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, offset_on_padded_shape, {})); + auto index_in_padded_shape = b->AddInstruction( + HloInstruction::CreateBinary(index_shape, HloOpcode::kAdd, iota, + broadcast_start_index_in_padded_shape)); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + std::vector predicates; + if (explicit_left_padding_on_full_shape > 0) { + auto valid_index_start = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_start, + ComparisonDirection::kGe))); + } + if (total_right_padding > 0) { + auto valid_index_limit = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + base_shape.dimensions(dim) + + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_limit, + ComparisonDirection::kLt))); + } + CHECK(!predicates.empty()); + auto is_valid = + predicates.size() == 2 + ? b->AddInstruction(HloInstruction::CreateBinary( + mask_shape, HloOpcode::kAnd, predicates[0], predicates[1])) + : predicates[0]; + auto masking_value = b->AddInstruction( + HloInstruction::CreateBroadcast(valid_slice->shape(), pad_value, {})); + valid_slice = b->AddInstruction( + HloInstruction::CreateTernary(valid_slice->shape(), HloOpcode::kSelect, + is_valid, valid_slice, masking_value)); + } + return valid_slice; +} + +HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original, + absl::Span dims) { + if (original.sharding().IsTileMaximal()) { + return original.hlo(); + } + // Create a window config to halo exchange for unevenly partitioned reverse + // dimensions. 
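  // For illustration: a base dimension of size 3 tiled across two shards is
  // rounded up to 4, so padding_low below becomes 4 - 3 = 1; the one element
  // of uneven padding that tiled sharding normally keeps on the right is then
  // presented on the left instead, which is what kReverse expects.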
+ Window window; + for (int64 i = 0; i < original.base_shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + int64 low_padding = 0; + if (absl::c_linear_search(dims, i)) { + low_padding = + RoundUpToNearest(original.base_shape().dimensions(i), + original.sharding().tile_assignment().dim(i)) - + original.base_shape().dimensions(i); + } + dim->set_padding_low(low_padding); + dim->set_padding_high(0); + dim->set_base_dilation(1); + } + + auto reshard_window = original.ReshardAsWindowedInput( + window, original.sharding(), + CreateZero(ShapeUtil::MakeShape(original.base_shape().element_type(), {}), + original.state().b), + /*mask_invalid_region=*/false); + if (!reshard_window.has_value()) { + return nullptr; + } + CHECK(!reshard_window->dynamic_slice_index_on_output.has_value()); + return reshard_window->sharded_input; +} + +bool IsNanSafeGt(HloComputation* comp) { + namespace m = match; + auto match_bitcast_f32 = [](int64 parameter_number) { + auto param = m::Parameter(parameter_number) + .WithShape(m::Shape().WithElementType(F32)); + auto param_s32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32)); + auto param_u32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32)); + return m::Select( + m::Lt(param_s32, m::ConstantScalar(0)), + m::BitcastConvert( + m::Subtract(m::ConstantScalar(std::numeric_limits::max()), + param_u32)) + .WithShape(m::Shape().WithElementType(S32)), + param_s32); + }; + auto match_bitcast_bf16 = [](int64 parameter_number) { + auto param = m::Convert(m::Parameter(parameter_number) + .WithShape(m::Shape().WithElementType(BF16))) + .WithShape(m::Shape().WithElementType(F32)); + auto param_s32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32)); + auto param_u32 = + m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32)); + return m::Select( + m::Lt(param_s32, m::ConstantScalar(0)), + m::BitcastConvert( + m::Subtract(m::ConstantScalar(std::numeric_limits::max()), + param_u32)) + .WithShape(m::Shape().WithElementType(S32)), + param_s32); + }; + // If root instruction is kSelect and compares indices if values are equal. 
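  // (Assumed context: a sort comparator with an index tie-break is typically
  //  of the form select(values-equal, index-compare, values-greater), so only
  //  operand 2 of the select -- the value comparison -- is matched against the
  //  NaN-safe GT pattern below.)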
+ if (comp->root_instruction()->opcode() == HloOpcode::kSelect) { + return Match(comp->root_instruction()->operand(2), + m::Gt(match_bitcast_f32(0), match_bitcast_f32(1))) || + Match(comp->root_instruction()->operand(2), + m::Gt(match_bitcast_bf16(0), match_bitcast_bf16(1))); + } + return Match(comp->root_instruction(), + m::Gt(match_bitcast_f32(0), match_bitcast_f32(1))) || + Match(comp->root_instruction(), + m::Gt(match_bitcast_bf16(0), match_bitcast_bf16(1))); +} + +absl::optional GetKValueInTopKWhenPartitionSortDim(HloInstruction* hlo) { + HloSortInstruction* sort = DynCast(hlo); + if (sort == nullptr || sort->operand_count() != 2) { + return absl::nullopt; + } + if (!IsNanSafeGt(sort->to_apply())) { + return absl::nullopt; + } + HloInstruction* data = sort->mutable_operand(0); + HloIotaInstruction* iota = + DynCast(sort->mutable_operand(1)); + const PrimitiveType element_type = data->shape().element_type(); + if (iota == nullptr || iota->shape().element_type() != S32 || + iota->opcode() != HloOpcode::kIota || + iota->iota_dimension() != sort->sort_dimension()) { + return absl::nullopt; + } + + const int64 sort_dim = sort->sort_dimension(); + + if (element_type != F32 && element_type != BF16 && element_type != S32 && + element_type != U32) { + return absl::nullopt; + } + + bool supported = true; + absl::optional k; + for (HloInstruction* gte : sort->users()) { + if (gte->opcode() != HloOpcode::kGetTupleElement) { + supported = false; + break; + } + + const HloInstruction* slice = gte->users()[0]; + if (slice->opcode() != HloOpcode::kSlice) { + // Non-slice user means we are not doing a TopK + supported = false; + break; + } + if (absl::c_any_of(slice->slice_starts(), [](int x) { return x != 0; }) || + absl::c_any_of(slice->slice_strides(), [](int x) { return x != 1; })) { + // Strided slice or slicing at the beginning isn't supported. + supported = false; + break; + } + for (int64 dim = 0; dim < data->shape().dimensions_size(); dim++) { + if (dim == sort_dim) { + continue; + } + if (slice->slice_limits(dim) != + slice->operand(0)->shape().dimensions(dim)) { + // Slicing along the other dimension isn't supported. + supported = false; + break; + } + } + if (!k.has_value()) { + k = slice->slice_limits(sort_dim); + } else if (k != slice->slice_limits(sort_dim)) { + // Different k for the different operands isn't supported. + supported = false; + break; + } + } + if (k == absl::nullopt || !supported) { + return absl::nullopt; + } + + // Only support when sort dim is sharded. + if (!data->has_sharding()) { + return absl::nullopt; + } + const HloSharding& sharding = sort->operand(0)->sharding(); + + if (sharding.IsTileMaximal()) { + return absl::nullopt; + } + + // Check if partitioned at sort dimension. + for (int64 dim : sort->dimensions()) { + if (sharding.tile_assignment().dim(dim) > 1) { + if (dim != sort_dim) { + return absl::nullopt; + } + } + } + + // Checks if partition size is smaller than k. + const int64 shard_count = sharding.tile_assignment().dim(sort_dim); + + if (shard_count <= 1) { + return absl::nullopt; + } + + const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, shard_count); + + if (k.value() >= per_partition_size) { + return absl::nullopt; + } + + return k; +} + +// Slice first k elements from sort_dim. 
+HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder, + int64 slice_dim, int64 k) { + const Shape& hlo_shape = hlo->shape(); + auto hlo_dims = hlo_shape.dimensions(); + std::vector start_indices(hlo_shape.dimensions_size(), 0); + std::vector limit_indices(hlo_dims.begin(), hlo_dims.end()); + std::vector strides(hlo_shape.dimensions_size(), 1); + limit_indices[slice_dim] = k; + auto output_shape = hlo_shape; + output_shape.set_dimensions(slice_dim, k); + return builder->AddInstruction(HloInstruction::CreateSlice( + output_shape, hlo, start_indices, limit_indices, strides)); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h new file mode 100644 index 00000000000..5f245667970 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -0,0 +1,268 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +namespace xla { +namespace spmd { + +// Returns true if the given sharding contains any replicated sharding. +bool HasReplicatedSharding(const HloSharding& sharding); + +// Creates zero value instructions of the given shape. +HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b); + +template +HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value, + SpmdBuilder* b) { + auto literal = LiteralUtil::CreateR0(value) + .ConvertToShape(ShapeUtil::MakeShape(type, {})) + .ValueOrDie(); + return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal))); +} + +inline HloInstruction* CreateFirstWithType(PrimitiveType type, SpmdBuilder* b) { + if (type == F32) { + auto float_pad_value = std::numeric_limits::quiet_NaN(); + return CreateR0WithType(type, -float_pad_value, b); + } + auto literal = LiteralUtil::MinValue(type); + return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal))); +} + +inline HloInstruction* CreateLastWithType(PrimitiveType type, SpmdBuilder* b) { + if (type == F32) { + auto float_pad_value = std::numeric_limits::quiet_NaN(); + return CreateR0WithType(type, float_pad_value, b); + } + auto literal = LiteralUtil::MaxValue(type); + return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal))); +} + +// Create a binary add computation of the given type and add to the module. 
+HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module); + +// Returns true if the shape can be evenly partitioned for the given sharding. +// All tile sharded dimensions should be evenly divisible and there should be no +// single-device sharding. Replicate sharding is considered even partition. +bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding); + +// Returns the shard shape of the given shape when it is partitioned for the +// target sharding. +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding); + +// Similar to ShapeUtil::ByteSizeOf(), but does not check it has dense layout +// since this can be before layout assignment. +int64 ShapeSizeInBytes(const Shape& shape); + +// Returns the shard shape for a partition without padding due to uneven +// sharding. +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id); + +// Generates the HLO instructions that represent the dimension offsets on any +// device. The size of the returned vector is the rank of the given shape. +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b); + +// Returns the offsets of the partition in the tile assignment. +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b); + +// Pads hlo to the desired shape using high padding. Either a builder or a +// computation needs to be supplied, but not both. +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, + HloComputation* computation = nullptr); + +// Returns the padded shape when combining all partitions. +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding); + +// Pads the HLO (with base shape) for uneven tiled partition to make it evenly +// partitionable. +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b); + +// Returns the index of the unique tile dimension. Returns absl::nullopt if the +// given sharding is not tiled or tiled along multiple dimensions. +absl::optional UniqueTiledDim(const HloSharding& sharding); + +// Utilities for symbolic offset calculation and halo exchange. +class OffsetCalculation; + +// Represents a calculation over integers: +// (shard_ordinal * multiplier + offset) / divisor +class MultiplyAddDivideOffsetCalculation { + public: + MultiplyAddDivideOffsetCalculation() + : multiplier_(0), offset_(0), divisor_(1) {} + MultiplyAddDivideOffsetCalculation(int64 multiplier, int64 offset, + int64 divisor); + + OffsetCalculation operator-( + const MultiplyAddDivideOffsetCalculation& other) const; + + bool operator==(const MultiplyAddDivideOffsetCalculation& other) const { + return multiplier_ == other.multiplier_ && offset_ == other.offset_ && + divisor_ == other.divisor_; + } + + bool IsConstant() const { return multiplier_ == 0; } + void Simplify(); + int64 Calculate(int64 shard_ordinal) const; + HloInstruction* Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const; + + // Returns the maximum result for shard ordinals in the range + // [start_ordinal, limit_ordinal). + int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const; + + private: + int64 multiplier_; + int64 offset_; + int64 divisor_; +}; + +// Represents a calculation over integers based on results of other calculations +// defined by an opcode. 
If the opcode is kCopy, it simply wraps an +// MultiplyAddDivideOffsetCalculation. +class OffsetCalculation { + public: + OffsetCalculation() : opcode_(HloOpcode::kCopy), copy_from_() {} + explicit OffsetCalculation( + const MultiplyAddDivideOffsetCalculation& copy_from) + : opcode_(HloOpcode::kCopy), copy_from_(copy_from) {} + OffsetCalculation(const OffsetCalculation& copy_from) { *this = copy_from; } + OffsetCalculation(HloOpcode opcode, + const MultiplyAddDivideOffsetCalculation& lhs, + const MultiplyAddDivideOffsetCalculation& rhs) + : opcode_(opcode), + lhs_(absl::make_unique(lhs)), + rhs_(absl::make_unique(rhs)) {} + OffsetCalculation(HloOpcode opcode, const OffsetCalculation& lhs, + const OffsetCalculation& rhs) + : opcode_(opcode), + lhs_(absl::make_unique(lhs)), + rhs_(absl::make_unique(rhs)) {} + + OffsetCalculation& operator=(const OffsetCalculation& other); + + // Returns whether the calculation returns the same value for all shards. This + // is conservative and could return false even if it is actually constant. + bool IsConstant() const; + + OffsetCalculation operator-(const OffsetCalculation& other) const; + bool operator==(const OffsetCalculation& other) const; + int64 Calculate(int64 shard_ordinal) const; + HloInstruction* Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const; + + // Returns the maximum result for shard ordinals in the range + // [start_ordinal, limit_ordinal). + int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const; + + private: + HloOpcode opcode_; + std::unique_ptr lhs_; + std::unique_ptr rhs_; + MultiplyAddDivideOffsetCalculation copy_from_; +}; + +// Performs halo exchange on the given dimension based on the provided +// left/right halo size functions. Returns nullopt if the halo is beyond the +// direct neighbor of the shard. +absl::optional ExchangeHalo( + HloInstruction* hlo, const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, int64 dim, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b); + +// Exchange halo on all dimensions of the HLO. Returns nullopt if any one of the +// dimensions fails to exchange halo (halo is beyond the neighbor shard). +absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b); + +// Exchanges halos and performs pad/dynamic-slice on the concatenated data such +// that the result starts with the first needed element on each shard. It also +// masks off invalid data due to padding. +// Arguments: +// hlo: the HLO op before halo exchange +// explicit_left_padding_on_full_shape: the amount of left padding to be added +// explicitly by this function on the base shape before partitioning. Without +// base dilation, this is usually set to the window's padding_low so that the +// sharded op do not need to add padding_low on the window; however, with base +// dilation, this could only be set to a custom size. +// padded_full_shape_size: the size of the padded full shape on the given +// dimension, which includes explicit_left_padding_on_full_shape and required +// right padding to make the shape evenly shardable. +// shard_size_with_halo: the shard size on the dimension after halo exchange. +// If different shards have different sizes, use the maximum size. 
+// offset_on_padded_shape: the offset HLO (S32) that represents the start of +// each shard on the padded full shape. +// pad_value: the padding value used on the full shape. +absl::optional<HloInstruction*> ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region = true); + +// Uses halo exchange to change from right-padding to left-padding for uneven +// tiled sharding on the given dimensions. Tiled sharding always pads uneven +// partitioned data on the right, but we need to swap it to the left for +// kReverse or kConvolution with window reversal. +HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original, + absl::Span<const int64> dims); + +// Checks if the computation is a GT comparison that is safe for NaNs. +bool IsNanSafeGt(HloComputation* computation); + +// Returns k in TopK when the input value is partitioned in the sort dimension. +absl::optional<int64> GetKValueInTopKWhenPartitionSortDim(HloInstruction* hlo); + +// Slices the first k elements at slice dimension. +HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder, + int64 slice_dim, int64 k); + +} // namespace spmd +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 2d33184b7d0..1111811d3a3 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -300,7 +300,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( } StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) { - VLOG(2) << "HLO module before WhileLoopConstantSinking:"; + VLOG(2) << "HLO module before WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); bool changed = false; @@ -332,10 +332,10 @@ StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) { } if (changed) { - VLOG(2) << "HLO module after WhileLoopConstantSinking:"; + VLOG(2) << "HLO module after WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); } else { - VLOG(2) << "HLO module unchanged after WhileLoopConstantSinking"; + VLOG(2) << "HLO module unchanged after WhileLoopInvariantCodeMotion"; } return changed; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 22ee5a16a30..52cbb8f95ac 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "absl/strings/ascii.h" #include "absl/strings/numbers.h" @@ -150,6 +151,19 @@ StatusOr<Shape> MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualStructure(const Shape& lhs, + const Shape& rhs) { + bool equal = true; + ForEachSubshape(lhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(rhs, index); + }); + ForEachSubshape(rhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(lhs, index); + }); + + return equal; +} + /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) { int64 accum = 0; for (int64 dimension : shape.dimensions()) { @@ -261,6 +275,12 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return ValidateShape(*shape); } +/* static */ Shape ShapeUtil::MakeStaticShape(const Shape& original) { + Shape result = original; + result.clear_dynamic_dimensions(); + return result; +} + /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span<const Shape> shapes) { Shape result; result.set_element_type(TUPLE); @@ -626,8 +646,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); } else if (shape.IsArray()) { - int64 byte_size = ByteSizeOfElements(shape); - return byte_size; + return ByteSizeOfElements(shape); } else if (shape.element_type() == TOKEN) { return 0; } else if (shape.element_type() == OPAQUE_TYPE) { @@ -1441,6 +1460,19 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + if (dynamic_shape.rank() != bounded_shape.rank()) { + return false; + } + for (int64 i = 0; i < dynamic_shape.rank(); ++i) { + if (dynamic_shape.dimensions(i) > bounded_shape.dimensions(i)) { + return false; + } + } + return true; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function<bool(int64)>& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 7e05e17865d..dde56587482 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -298,6 +298,16 @@ class ShapeUtil { // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); + // Two shapes have the same structure if all subshape indices of lhs are + // present in rhs and vice versa. + // A nested tuple shape of (F32, (S32[2], F32[2, 2])) is structurally equal to + // (S32, (F32[3], S32[2])) as their structures are both (,(,)) + // + // In contrast, (F32, (F32, F32)) is structurally different from + // ((F32, F32), F32) as the former has structure (,(,)) while the latter has + // ((,),) + static bool EqualStructure(const Shape& lhs, const Shape& rhs); + // Returns the number of dimensions for which the dimension is not (trivially) // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just // fluff. Note that zero dimensions are included in the true rank, e.g., @@ -339,6 +349,9 @@ class ShapeUtil { // element type changed to type. static Shape ChangeElementType(const Shape& original, PrimitiveType type); + // Returns a shape with the same dimensions but with all dimensions set to static. 
+ static Shape MakeStaticShape(const Shape& original); + // Creates a tuple shape from a slice of element shapes within the tuple. static Shape MakeTupleShape(absl::Span shapes); @@ -643,12 +656,16 @@ class ShapeUtil { static Shape FilterDimensions(const std::function& p, Shape shape); - // Iterates through all the shape indexes, in minor to major order, starting - // from the base indexes, incrementing by the incr steps, up to count - // (index[i] < base[i] + count[i]), and calls the visitor_function with the - // current index. - // The visitor_function visitor function should return true if it wants to - // continue, or false otherwise. + // Returns true if `dynamic_shape` has dimensions that are less-equal to the + // "bounded_shape". + static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Iterates through all the shape indexes, in minor to major order, + // starting from the base indexes, incrementing by the incr steps, up to + // count (index[i] < base[i] + count[i]), and calls the visitor_function + // with the current index. The visitor_function visitor function should + // return true if it wants to continue, or false otherwise. // // visitor_function must be a callable of type // StatusOr(absl::Span) or compatible. diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 5b83186ffa4..790497f888e 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -76,6 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test { void SetFastMathDisabled(bool disabled) { auto* opts = execution_options_.mutable_debug_options(); opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_cpu_enable_fast_min_max(!disabled); opts->set_xla_gpu_enable_fast_min_max(!disabled); } diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 6d64cb0a510..26cb25acbfe 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -416,6 +416,10 @@ XLA_TEST_P(ParametricDotTest, TestF16) { TestImpl(); } #endif XLA_TEST_P(ParametricDotTest, TestF32) { TestImpl(); } XLA_TEST_P(ParametricDotTest, TestF64) { TestImpl(); } +XLA_TEST_P(ParametricDotTest, TestC64) { TestImpl>(); } +#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX128 +XLA_TEST_P(ParametricDotTest, TestC128) { TestImpl>(); } +#endif XLA_TEST_P(ParametricDotTest, TestS32) { TestImpl(); } INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest, diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 8eed609a134..7b64be5597b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -165,6 +165,16 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) { return precision_config; } +void HloTestBase::SetAotFastMathDebugOptions(DebugOptions* options) { + options->set_xla_cpu_enable_fast_math(true); + options->set_xla_gpu_enable_fast_min_max(true); + options->set_xla_cpu_enable_fast_min_max(true); + options->set_xla_cpu_fast_math_honor_nans(false); + options->set_xla_cpu_fast_math_honor_infs(false); + options->set_xla_cpu_fast_math_honor_functions(false); + options->set_xla_cpu_fast_math_honor_division(false); +} + DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = 
GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index d05776a0cb9..85b1876dd3c 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -100,6 +100,10 @@ class HloTestBase : public ::testing::Test { static PrecisionConfig DefaultPrecisionConfig(int operands); + // Sets most fast math options to be enabled to model the fast math flags + // generally used for CPU:AOT compilation. + static void SetAotFastMathDebugOptions(DebugOptions* options); + protected: // This uses the interpreter backend as the reference backend and // automatically finds another supported backend as the test backend. If the diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 3407a68f709..40e226f9902 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -310,8 +310,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XLA_TEST_F(VecOpsSimpleTest, ClampFloatEdgeCases) { XlaBuilder builder(TestName()); - mutable_debug_options()->set_xla_cpu_enable_fast_math(false); - mutable_debug_options()->set_xla_gpu_enable_fast_min_max(false); + SetFastMathDisabled(true); auto low = ConstantR1<float>(&builder, {NAN, 1, 1}); auto high = ConstantR1<float>(&builder, {3, NAN, 3}); auto x = ConstantR1<float>(&builder, {2, 2, NAN}); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 5a482305513..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -863,7 +863,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { // Starts = iteration * 2; auto starts = Mul(iteration, ConstantR0<int32>(&builder, 2)); // UpdateSlice. - auto out1 = DynamicUpdateSlice(input, update, starts); + auto out1 = DynamicUpdateSlice(input, update, {starts}); Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc index 4f8a6b43314..b6c62beff74 100644 --- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc +++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc @@ -112,8 +112,7 @@ constexpr int64 kDefaultMaxNumNodesInAllPaths = 100; using absl::EqualsIgnoreCase; -// A global control for whether backend configuration display is enabled. -bool show_backend_config = true; +HloRenderOptions hlo_render_options; HloInstruction* FindInstruction(const HloModule& module, string node_name) { if (absl::StartsWith(node_name, "%")) { @@ -160,6 +159,8 @@ void DoHelpCommand() { Renders all nodes in . backend_config [on|off] Controls whether backend operation configuration information is printed. + show_fusion_subcomputations [on|off] + Controls whether fusion subcomputations are shown. list [name|op_name|op_type] Lists all instructions whose name, metadata op_name, or metadata op_type contains as a substring. @@ -182,15 +183,32 @@ void DoHelpCommand() { // Turn metadata-printing on or off. 
void DoBackendConfigCommand(const std::vector& tokens) { if (tokens.size() == 2 && tokens[1] == "on") { - show_backend_config = true; + hlo_render_options.show_backend_config = true; } else if (tokens.size() == 2 && tokens[1] == "off") { - show_backend_config = false; + hlo_render_options.show_backend_config = false; } else if (tokens.size() != 1) { std::cerr << "(Illegal backend_config value. Use either 'on' or 'off'.)" << std::endl; } std::cout << "Backend configuration display " - << (show_backend_config ? "ON" : "OFF") << std::endl; + << (hlo_render_options.show_backend_config ? "ON" : "OFF") + << std::endl; +} + +// Turn fusion computation display on or off. +void DoShowFusionSubcomputationsCommand(const std::vector& tokens) { + if (tokens.size() == 2 && tokens[1] == "on") { + hlo_render_options.show_fusion_subcomputations = true; + } else if (tokens.size() == 2 && tokens[1] == "off") { + hlo_render_options.show_fusion_subcomputations = false; + } else if (tokens.size() != 1) { + std::cerr << "(Illegal show_fusion_subcomputations value. Use either " + "'on' or 'off'.)" + << std::endl; + } + std::cout << "Fusion subcomputations display " + << (hlo_render_options.show_fusion_subcomputations ? "ON" : "OFF") + << std::endl; } // List all computations in the module. @@ -373,7 +391,7 @@ void DoExtractCommand(const HloModule& module, auto extracted_module = ExtractModule(instr, height); std::cout << extracted_module->ToString( HloPrintOptions::ShortParsable().set_print_backend_config( - show_backend_config)) + hlo_render_options.show_backend_config)) << std::endl; } @@ -517,7 +535,7 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module, } RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderAllPathsFromTo(*from, *to, max_nodes, format, - /*show_backend_config=*/show_backend_config); + hlo_render_options); }); } @@ -582,15 +600,13 @@ void DoPlotCommand(const Options& opts, const HloModule& module, RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderGraph(*comp, /*label=*/"", comp->parent()->config().debug_options(), format, - /*hlo_execution_profile=*/nullptr, - /*show_backend_config=*/show_backend_config); + /*hlo_execution_profile=*/nullptr, hlo_render_options); }); } else { RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { - return RenderNeighborhoodAround( - *instr, graph_width, format, - /*show_backend_config=*/show_backend_config, - /*boundary=*/boundary); + return RenderNeighborhoodAround(*instr, graph_width, format, + hlo_render_options, + /*boundary=*/boundary); }); } } @@ -617,6 +633,8 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) { DoHelpCommand(); } else if (tokens[0] == "backend_config") { DoBackendConfigCommand(tokens); + } else if (tokens[0] == "show_fusion_subcomputations") { + DoShowFusionSubcomputationsCommand(tokens); } else if (tokens[0] == "list") { if (tokens.size() > 1 && tokens[1] == "computations") { DoListComputationsCommand(module, tokens); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index a015af674af..6595bcbe292 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -148,9 +148,20 @@ message DebugOptions { // xla_cpu_enable_fast_math is false. bool xla_cpu_fast_math_honor_functions = 129; + // When false we lower the Minimum and Maximum hlos in the CPU backend such + // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NaN. 
In other words, if flag + // this is false we always propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the gpu flag + // below! + bool xla_cpu_enable_fast_min_max = 140; + // When true we lower the Minimum and Maximum hlos in the GPU backend such // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag // this is true we don't propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the cpu flag + // above! bool xla_gpu_enable_fast_min_max = 100; // Allows xla to increase the output precision of floating point operations. @@ -276,18 +287,16 @@ message DebugOptions { // memory, or have bugs. bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138; - // It is usually preferable to not fallback to the driver; it can consume more - // memory, or have bugs. - bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139; - - // Next id: 140 + // Next id: 141 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. map xla_backend_extra_options = 500; - reserved 5, 117, 133; // were xla_hlo_dump_as_graphdef, xla_dump_to, and - // xla_gpu_use_horizontal_fusion + reserved 5, 117, 133, + 139; // were xla_hlo_dump_as_graphdef, xla_dump_to, + // xla_gpu_use_horizontal_fusion, and + // xla_gpu_unsafe_fallback_to_driver_on_ptxas_error } // These settings control how XLA compiles and/or runs code. Not all settings @@ -333,6 +342,10 @@ message ExecutionOptions { // Used to identify a set of programs that should be launch together. int32 launch_id = 10; + + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. + bool use_spmd_partitioning = 11; } message GetDeviceHandlesRequest { diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index d71e6e2cc73..494ba29e981 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -49,6 +49,7 @@ cc_library( deps = [ ":xrt_state_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index 1bcd8561e61..ba6e6a093d6 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -158,7 +158,7 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, argument_layout_ptrs[i] = &argument_layouts[i]; } xla::ExecutableBuildOptions build_options; - build_options.set_device_ordinal(client->default_device_ordinal()); + build_options.set_device_ordinal(device_ref.device_ordinal()); build_options.set_num_replicas(num_replicas); build_options.set_result_layout(xla::Shape(config.program_shape().result())); build_options.set_device_allocator(device_ref.backend()->memory_allocator()); @@ -206,7 +206,8 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, CompilationCacheKey(computation_proto, &key)); // Process-wide cache of XLA executables. 
- auto cache_or = GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); OP_REQUIRES_OK(ctx, cache_or.status()); auto cache = cache_or.ConsumeValueOrDie(); @@ -259,15 +260,11 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); - // Process-wide cache of XLA executables. - XRTCompilationCache* cache; - OP_REQUIRES_OK(ctx, rm->Lookup( - rm->default_container(), - kXRTCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); + OP_REQUIRES_OK(ctx, cache_or.status()); + auto cache = cache_or.ConsumeValueOrDie(); const Tensor& keys_tensor = ctx->input(0); auto flat_keys = keys_tensor.flat(); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index b641f333e8b..2fc599e42df 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" @@ -38,7 +39,11 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/monitoring/timed.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -146,16 +151,245 @@ xla::StatusOr GetChainedOpInputs( return std::move(input_buffers); } +// Given a shape, returns a byte array representing the shape metadata of the +// shape. The shape metadata contains dimensions sizes stored as contiguous S32. +std::vector PrepareMetadata(const xla::Shape& shape) { + DCHECK(shape.is_static()); + DCHECK(shape.IsArray()); + // Each dimension size is stored as a S32. + std::vector result(shape.dimensions_size()); + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + result[i] = shape.dimensions(i); + } + return result; +} + +// Given a buffer with dynamic shape, update buffer metadata at the correct +// offset starting from that buffer. +// +// +-----------+ +// |Payload | +// +-----------+ +// | Padding | +// +-----------+ +// |dim_size_0 | (each dim_size is a S32): +// +-----------+ +// |dim_size_1 | +// +-----------+ +// .......... 
+// +-----------+ +// +// Size of payload = ByteSizeOf(runtime_shape) +// Size of payload + padding = ByteSizeOf(compile_time_shape_static) +// Size of payload + padding + metadata = ByteSizeOf(compile_time_shape) +Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, + const xla::Shape& compile_time_shape, + const xla::Shape& runtime_shape) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape compile_time_shape_static = + xla::ShapeUtil::MakeStaticShape(compile_time_shape); + uint64 offset = shape_size_fn(compile_time_shape_static); + uint64 metadata_size = shape_size_fn(compile_time_shape) - offset; + auto metadata_buffer = + stream->parent()->GetSubBuffer(buffer, offset, metadata_size); + + auto metadata_literal = std::make_shared( + xla::LiteralUtil::CreateR1(PrepareMetadata(runtime_shape))); + TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDeviceAsync( + stream, *metadata_literal, metadata_buffer)); + // Retain the literal until the end of the transfer. + stream->ThenDoHostCallback([metadata_literal]() { return Status::OK(); }); + return Status::OK(); +} + +// Given a static input buffer, convert it to dynamic form by expanding it to +// the bounded size and attaching a metadata filled with dimension sizes. +// +// From: +// +--------+ +// |Payload | +// +--------+ +// +// To: +// +// +--------+ +// |Payload | +// +--------+ +// | Padding| +// +--------+ +// |Metadata| +// +--------+ +// +// As we can't expand the size of an existing memory allocation, a reallocation +// is required. A list of new allocations are returned after this function. The +// caller is reponsible for maintaining those allocations. +xla::StatusOr> UpdateDynamicInputs( + se::Stream* stream, se::DeviceMemoryAllocator* allocator, + std::vector runtime_inputs, + const std::vector& compile_time_shapes) { + std::vector new_allocations; + TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + for (int64 i = 0; i < compile_time_shapes.size(); i++) { + const xla::Shape& compile_time_shape = compile_time_shapes[i].shape(); + if (compile_time_shape.is_static()) { + continue; + } + auto* runtime_input = runtime_inputs[i]; + + bool element_modified = false; + TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( + compile_time_shape, + [&](const xla::Shape& compile_time_shape, + const xla::ShapeIndex& index) -> Status { + if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + return Status::OK(); + } + const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( + runtime_input->on_device_shape(), index); + TF_RET_CHECK(!runtime_shape.IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( + runtime_shape, compile_time_shape)); + se::DeviceMemoryBase* static_input = + runtime_input->buffers().mutable_element(index); + TF_ASSIGN_OR_RETURN( + auto dynamic_input, + allocator->Allocate(stream->parent()->device_ordinal(), + shape_size_fn(compile_time_shape))); + new_allocations.emplace_back(std::move(dynamic_input)); + se::DeviceMemory* dynamic_input_base = + new_allocations.back().ptr(); + // Send the original data to the new location. 
+ stream->ThenMemcpyD2D(dynamic_input_base, *static_input, + static_input->size()); + TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, + compile_time_shape, runtime_shape)); + // Modify the memory location in the input shape tree to point to the + // new input. + runtime_input->set_buffer(*dynamic_input_base, index); + element_modified = true; + return Status::OK(); + })); + if (element_modified) { + runtime_input->set_shapes(compile_time_shape, compile_time_shape); + // The input location has been modified, need to fix tuple table to + // point to the correct address. + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR( + transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + } + } + return std::move(new_allocations); +} + +xla::StatusOr ReadMetadataLiteral( + se::Stream* stream, se::DeviceMemoryBase* buffer, + const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape buffer_shape_static = + xla::ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + TF_RET_CHECK(metadata_size != 0); + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + return transfer_manager->TransferArrayFromDevice( + stream, + xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), + metadata_buffer); +} + +// For each subshape in the result buffer that's dynamic, read the dynamic +// dimension sizes from the metadata, and update output shapes. The result shape +// is a static and concrete shape. +xla::Status UpdateDynamicOutputs(se::Stream* stream, + xla::ShapedBuffer* shaped_buffer, + xla::Shape* output_host_shape, + xla::Shape* output_device_shape) { + DCHECK(output_device_shape->is_dynamic()); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const xla::Shape& buffer_shape = + xla::ShapeUtil::GetSubshape(*output_device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + xla::Shape& host_shape = + *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); + xla::Shape& device_shape = + *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); + if (device_shape.is_static()) { + return Status::OK(); + } + TF_ASSIGN_OR_RETURN(auto metadata, + ReadMetadataLiteral(stream, buffer, buffer_shape, + transfer_manager)); + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + output_host_shape->clear_dynamic_dimensions(); + output_device_shape->clear_dynamic_dimensions(); + return Status::OK(); +} + +// Create output tuple from run_result. 
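
To make the buffer layout used by UpdateMetadata and ReadMetadataLiteral above concrete, the helper below is an illustrative sketch (not part of this change) that computes where the appended dimension-size metadata lives for a bounded dynamic shape; shape_size_fn stands in for the compiler's ShapeSizeBytesFunction(). The function that follows then assembles the output tuple from the executed result.

// Illustrative only: for e.g. f32[<=4], the payload occupies the bytes of the
// static shape f32[4]; the metadata (one S32 per dimension) is appended after
// any padding, so offset + metadata_size == shape_size_fn(compile_time_shape).
std::pair<uint64, uint64> MetadataOffsetAndSize(
    const xla::Shape& compile_time_shape,
    const std::function<int64(const xla::Shape&)>& shape_size_fn) {
  xla::Shape static_shape = xla::ShapeUtil::MakeStaticShape(compile_time_shape);
  uint64 offset = shape_size_fn(static_shape);                        // payload + padding
  uint64 metadata_size = shape_size_fn(compile_time_shape) - offset;  // dim sizes as S32
  return {offset, metadata_size};
}
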
+xla::StatusOr> CreateOutputTuple( + se::Stream* stream, xla::ScopedShapedBuffer run_result, + xla::Backend* backend, int device_ordinal) { + XRTTupleAllocation* output_tuple; + xla::ShapedBuffer shaped_buffer = run_result.release(); + if (shaped_buffer.on_device_shape().is_dynamic()) { + // Update dynamic shapes from output buffer, and create a XRT tensor with + // dimension sizes read from metadata. + xla::Shape output_host_shape = shaped_buffer.on_host_shape(); + xla::Shape output_device_shape = shaped_buffer.on_device_shape(); + TF_RETURN_IF_ERROR(UpdateDynamicOutputs( + stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, output_host_shape, output_device_shape, backend, + device_ordinal, &output_tuple)); + } else { + // Fast-path: Don't copy shapes of output buffer. + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, backend, device_ordinal, &output_tuple)); + } + return RefPtr(output_tuple); +} + xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { VLOG(2) << "Executing computation."; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(device_ref->backend()->memory_allocator()); run_options.set_intra_op_thread_pool(&context->eigen_cpu_device()); run_options.set_rng_seed(rng_seed); + if (config.run_id() != 0) { + run_options.set_run_id(xla::RunId(config.run_id())); + } if (executable->executable() ->module_config() .has_static_device_assignment()) { @@ -164,8 +398,11 @@ xla::StatusOr> RunExecutable( } xla::GpuExecutableRunOptions gpu_options; std::vector gpu_global_ids; - if (replica_id >= 0) { - gpu_global_ids.emplace_back(replica_id); + if (config.local_replica_mapping_size() > 0) { + gpu_global_ids.reserve(config.local_replica_mapping_size()); + for (auto& gid : config.local_replica_mapping()) { + gpu_global_ids.emplace_back(xla::GlobalDeviceId(gid)); + } gpu_options.set_gpu_global_device_ids(gpu_global_ids); } std::shared_ptr nccl_factory = GetNcclUniqueIdFactory(); @@ -184,18 +421,31 @@ xla::StatusOr> RunExecutable( Env* env = Env::Default(); auto start_time = env->NowMicros(); + const std::vector& shape_layouts = + executable->executable() + ->module_config() + .entry_computation_layout() + .parameter_layouts(); + TF_ASSIGN_OR_RETURN(auto new_allocations, + UpdateDynamicInputs(stream, run_options.allocator(), + input_buffers.input_pointers, + shape_layouts)); + auto new_allocations_ptr = + std::make_shared>( + std::move(new_allocations)); TF_ASSIGN_OR_RETURN( xla::ScopedShapedBuffer run_result, executable->Run(input_buffers.input_pointers, run_options)); + // Retain the new allocation for input memory until the end of execution. 
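
The host callback on the next line is the same keep-alive idiom already used in UpdateMetadata above: a no-op callback that captures a shared_ptr so the captured allocations outlive every operation previously enqueued on the stream. In generic form (names are placeholders):

auto data = std::make_shared<std::vector<uint8>>(128);
// ... enqueue asynchronous work on `stream` that reads from *data ...
stream->ThenDoHostCallback([data]() { return Status::OK(); });  // `data` released no earlier than here
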
+ stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); + auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - auto shaped_buffer = run_result.release(); - XRTTupleAllocation* output_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, device_ref->backend(), device_ref->device_ordinal(), - &output_tuple)); - RefPtr output_tuple_ptr(output_tuple); + TF_ASSIGN_OR_RETURN( + RefPtr output_tuple_ptr, + CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), + device_ref->device_ordinal())); // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be @@ -208,7 +458,7 @@ xla::StatusOr> RunExecutable( const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? output_tuple->AliasBufferFrom( + ? output_tuple_ptr->AliasBufferFrom( *input_buffers.input_tuples[alias.parameter_number], alias.parameter_index, output_index) : Status::OK(); @@ -222,10 +472,11 @@ xla::StatusOr> ExecuteComputation( OpKernelContext* context, XRTMemoryManager* memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { auto runfn = [&]() { return RunExecutable(context, device_ref, executable, input_buffers, stream, - rng_seed, replica_id); + rng_seed, config); }; // We pass zero as requested_free_size as there is no simple way to get the @@ -241,14 +492,15 @@ xla::StatusOr> ExecuteComputation( XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const std::vector& input_coords, bool release_inputs, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { XRTMemoryManager::WorkingSet working_set(memory_manager); TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, GetInputBuffers(&working_set, device_ref->backend(), input_coords, release_inputs)); return ExecuteComputation(context, memory_manager.get(), device_ref, executable, input_buffers, stream, rng_seed, - replica_id); + config); } // XRTExecuteOp @@ -297,8 +549,9 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { bool release_inputs = config_proto.release_input_handles(); bool release_compilation = config_proto.release_compilation_handle(); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. 
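
Since the execution path above now takes a xrt::CommonExecutionConfig instead of a bare replica_id, a client building the XRTExecutionConfig proto would populate the shared fields roughly as sketched below; the field names come from the message added to xrt.proto later in this change, and the values are placeholders. The ScopedRef declared next pins the underlying device for the duration of the run, as the comment above notes.

xrt::XRTExecutionConfig exec_config;
exec_config.set_release_input_handles(true);
auto* common = exec_config.mutable_common_config();
common->set_run_id(42);                // correlates executes launched in parallel
common->add_local_replica_mapping(0);  // local device ordinal 0 -> global replica 0
common->add_local_replica_mapping(1);  // local device ordinal 1 -> global replica 1
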
class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -330,7 +583,7 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { RefPtr output_tuple, ExecuteComputation(context, memory_manager, &device_ref, executable, input_coords, release_inputs, stream, rng_seed, - config_proto.replica_id())); + config_proto.common_config())); return CreateExecuteOutput(context, memory_manager.get(), std::move(output_tuple), @@ -379,8 +632,9 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { xrt::XRTChainedExecuteConfig config; TF_RET_CHECK(ParseFromTString(execution_config.scalar()(), &config)); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -408,7 +662,7 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { return ExecuteComputation(context, memory_manager.get(), &device_ref, executable, input_buffers, stream, rng_seed, - config.replica_id()); + config.common_config()); }; return ExecuteChained(context, memory_manager, device_ref.backend(), diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 243289c8821..67647cc4285 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -49,6 +49,91 @@ limitations under the License. namespace tensorflow { namespace { +xla::XlaComputation ReturnDynamicR1() { + xla::XlaBuilder builder("ReturnDynamicR1"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + return builder.Build(pad_sum).ValueOrDie(); +} + +xla::XlaComputation ReturnDynamicR2() { + xla::XlaBuilder builder("ReturnDynamicR2"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {2, 4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {2, 4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto pad_sum_dim0 = xla::SetDimensionSize(sum, p2, 0); + auto pad_sum_dim1 = xla::SetDimensionSize(pad_sum_dim0, p2, 1); + return builder.Build(pad_sum_dim1).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); + auto p1 = xla::Parameter(&builder, 1, dyn_shape, "P1"); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR2() { + xla::XlaBuilder builder("AcceptDynamicR2"); + xla::Shape dyn_shape; + dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); + dyn_shape.set_dynamic_dimension(1, true); + auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); + auto negate = xla::Neg(p0); + return builder.Build(negate).ValueOrDie(); +} + +xla::XlaComputation ReturnDynamicR1Tuple() { + xla::XlaBuilder 
builder("ReturnDynamicR1Tuple"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + auto one = xla::One(&builder, xla::S32); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + auto pad_sub = xla::SetDimensionSize(sub, p2 + one, 0); + auto tuple = xla::Tuple(&builder, {pad_sum, sum, pad_sub}); + return builder.Build(tuple, /*remove_dynamic_dimensions=*/true).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1Tuple() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + xla::Shape tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + xla::Shape nest_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + auto p = xla::Parameter(&builder, 0, tuple_shape, "P0"); + auto p0 = xla::GetTupleElement(p, 0); + auto p1 = xla::GetTupleElement(p, 1); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +template +xla::LiteralProto CreateR0(T v) { + auto array = xla::LiteralUtil::CreateR0(v); + return array.ToProto(); +} + class XrtClientSession : public ClientSession { public: explicit XrtClientSession(const Scope& scope) : ClientSession(scope) { @@ -61,6 +146,11 @@ class XrtClientSession : public ClientSession { string* xla_test_device_ptr; // initial value set in main() string* xla_platform_ptr; // initial value set in main() +bool SupportDynamicShapes() { + // TODO(jackcao): Support dynamic shapes on XLA GPU. 
+ return *xla_test_device_ptr != "XLA_GPU"; +} + string DeviceFromFlag() { string xla_test_device = *xla_test_device_ptr; return absl::StrCat("/device:", xla_test_device, ":0"); @@ -1035,6 +1125,353 @@ TEST(RawApiTest, CompileAndExecute) { EXPECT_EQ(program_shape.parameters_size(), 2); } +TEST(RawApiTest, DynamicR1Test) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, 2.5f, 1.17f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(ReturnDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, DynamicR2Test) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = xla::LiteralUtil::CreateR2({{1.0f, 2.0f, 0.5f, -1.0f}, + {1.5f, 2.5f, 3.0f, -2.0f}}) + .ToProto(); + xrt::XLAAllocation p1; + *p1.mutable_value() = xla::LiteralUtil::CreateR2({{1.0f, -1.0f, 2.5f, 1.17f}, + {1.2f, -1.6f, 2.8f, 1.24f}}) + .ToProto(); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2, 4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2, 4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = 
xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); + dyn_shape.set_dynamic_dimension(0, true); + dyn_shape.set_dynamic_dimension(1, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(ReturnDynamicR2(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto expected = xla::LiteralUtil::CreateR2({{2.0f, 1.0f}, {2.7, 0.9}}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, DynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f, 1.0f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape( + {dyn_shape, xla::ShapeUtil::MakeShape(xla::F32, {4}), dyn_shape}) + .ToProto(); + StoreComputationSnapshot(ReturnDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, 
+ {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected0 = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + auto expected1 = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f, 0.0f}); + auto expected2 = xla::LiteralUtil::CreateR1({0.0f, 3.0f, 1.0f}); + auto expected = + xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLATupleNode tuple_desc; + auto subdesc_10 = tuple_desc.add_tuples(); + auto subdesc_11 = tuple_desc.add_tuples(); + subdesc_10->set_input_index(0); + subdesc_10->set_release_input_handle(true); + subdesc_11->set_input_index(1); + subdesc_11->set_release_input_handle(true); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + xla::Shape dyn_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_input_shape, dyn_input_shape}); + *shapes->add_parameters() = dyn_tuple_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + + auto tuple_0 = ops::Const(root.WithDevice("/device:CPU:0"), + tuple_desc.SerializeAsString()); + auto t0_handle = ops::XRTMakeTuple( + root, tuple_0, + {static_cast(p0_handle), static_cast(p1_handle)}); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {static_cast(t0_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1Test) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support 
dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto allocate_op_0 = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto allocate_op_1 = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(allocate_op_0), Output(allocate_op_1)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR2Test) { + if (!SupportDynamicShapes()) { + GTEST_SKIP() + << "Skipping the test if backend doesn't support dynamic shapes"; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = + xla::LiteralUtil::CreateR2({{-1.0f, 3.0f, 1.0f}, {-2.0f, -1.0f, 3.0f}}) + .ToProto(); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + // Compile time expects ascending layout. 
+ xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); + dyn_shape.set_dynamic_dimension(1, true); + *shapes->add_parameters() = dyn_shape.ToProto(); + + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR2(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto result = + ops::XRTExecute(root, c_handle.handle, e_config, {Output(p0_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR2( + {{1.0f, -3.0f, -1.0f}, {2.0f, 1.0f, -3.0f}}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { xrt::XLAAllocation p0; *p0.mutable_value() = FloatVector({1.0f, 2.0f}); diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 1cbd851f7ef..9a351732c4b 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -111,6 +111,17 @@ message XLATupleNode { repeated XLATupleNode tuples = 3; } +message CommonExecutionConfig { + // The replica index this execute is driving. + int32 replica_id = 1; + // Mapping local device ordinals to global replica IDs. + // local_replica_mapping[LOCAL_DEVICE_ORDINAL] = GLOBAL_REPLICA_ID + repeated int32 local_replica_mapping = 2; + // The execution run ID used to correlate different XRT execute operations + // happeining in parallel from different threads. + int64 run_id = 3; +} + // Options for an XLA execution. message XRTExecutionConfig { // Local device to run on. This is present because the execute Op @@ -133,8 +144,9 @@ message XRTExecutionConfig { // a single tuple allocation the execution will return a vector of // allocations, one for each of the first-level elements of the result tuple. bool return_exploded_tuple = 7; - // The replica index this execute is driving. - int32 replica_id = 8; + reserved 8; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 9; } message XRTChainedExecuteConfig { @@ -145,8 +157,9 @@ message XRTChainedExecuteConfig { // Optional key to disambiguate between executions. This is only needed if // multiple host send/recvs may be outstanding concurrently with executions. string execution_instance_key = 3; - // The replica index this execute is driving. - int32 replica_id = 4; + reserved 4; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 5; } // A single chained execute operation. 
An operation can either be a device data diff --git a/tensorflow/compiler/xrt/xrt_device.cc b/tensorflow/compiler/xrt/xrt_device.cc index 1b5557d556d..46954572c5d 100644 --- a/tensorflow/compiler/xrt/xrt_device.cc +++ b/tensorflow/compiler/xrt/xrt_device.cc @@ -17,19 +17,56 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt_device.h" +#include + #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { +namespace { + +class ResourceMgrArena { + public: + static ResourceMgrArena* Get() { + static ResourceMgrArena* arena = new ResourceMgrArena(); + return arena; + } + + ResourceMgr* GetResourceMgr(const std::string& platform_name) { + mutex_lock lock(mutex_); + auto it = resource_managers_.find(platform_name); + if (it == resource_managers_.end()) { + it = resource_managers_.emplace(platform_name, new ResourceMgr()).first; + } + return it->second; + } + + private: + mutex mutex_; + std::map resource_managers_; +}; + +} // namespace /*static*/ Status XRTGenericDeviceAccessor::GetResourceManager( OpKernelContext* ctx, ResourceMgr** rm) { - *rm = ctx->resource_manager(); + const XlaDevice::Metadata* metadata; + TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); + *rm = ResourceMgrArena::Get()->GetResourceMgr(metadata->platform()->Name()); return Status::OK(); } +/* static */ xla::StatusOr> +XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries) { + ResourceMgr* rm; + TF_RETURN_IF_ERROR(GetResourceManager(ctx, &rm)); + return tensorflow::GetOrCreateCompilationCache(rm, max_number_of_entries); +} + /*static*/ Status XRTGenericDeviceAccessor::InitScopedRef( OpKernelContext* ctx, int device_ordinal, ScopedRef* scoped_ref) { const XlaDevice::Metadata* metadata; diff --git a/tensorflow/compiler/xrt/xrt_device.h b/tensorflow/compiler/xrt/xrt_device.h index 5ebee7641f0..02fab315830 100644 --- a/tensorflow/compiler/xrt/xrt_device.h +++ b/tensorflow/compiler/xrt/xrt_device.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -31,6 +32,9 @@ class XRTGenericDeviceAccessor { public: static Status GetResourceManager(OpKernelContext* ctx, ResourceMgr** rm); + static xla::StatusOr> GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries); + // We use a ScopedRef pattern here even though it's not strictly necessary, // just so that templated uses of this and the TPU accessor class will be as // similar as possible. 
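
The practical effect of the ResourceMgrArena above is that XRT state such as the compilation cache is now keyed by platform name rather than tied to the per-device ResourceMgr of the incoming OpKernelContext. A small sketch of the resulting sharing; ResourceMgrArena is internal to xrt_device.cc and the platform name string is only an example.

// Two XRT kernels running on different devices of the same platform resolve
// to the same ResourceMgr, and therefore share one XRTCompilationCache.
tensorflow::ResourceMgr* rm_a = ResourceMgrArena::Get()->GetResourceMgr("CUDA");
tensorflow::ResourceMgr* rm_b = ResourceMgrArena::Get()->GetResourceMgr("CUDA");
DCHECK_EQ(rm_a, rm_b);
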
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b4bec2a6907..2b16801f6ed 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -83,7 +83,6 @@ load( "tf_gen_op_libs", "tf_genrule_cmd_append_to_srcs", "tf_opts_nortti_if_lite_protos", - "tf_opts_nortti_if_mobile", "tf_portable_full_lite_protos", "transitive_hdrs", ) @@ -100,28 +99,23 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") -# buildifier: disable=same-origin-load -# Placeholder: load("//tensorflow:tensorflow.bzl", "tf_portable_proto_lib") - # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") # For platform specific build config load( "//tensorflow/core/platform:build_config.bzl", - "tf_additional_all_protos", "tf_additional_lib_deps", "tf_additional_test_deps", "tf_jspb_proto_library", "tf_kernel_tests_linkstatic", "tf_lib_proto_parsing_deps", "tf_portable_deps_no_runtime", + "tf_portable_proto_lib", "tf_proto_library", - "tf_proto_library_cc", "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", - "tf_pyclif_proto_library", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -184,18 +178,18 @@ package_group(name = "friends") # filegroup; e.g. ones with individual proto_library targets. # LINT.IfChange COMMON_PROTO_SRCS = [ - "protobuf/bfc_memory_map.proto", - "protobuf/config.proto", - "protobuf/cluster.proto", - "protobuf/debug.proto", - "protobuf/device_filters.proto", - "protobuf/device_properties.proto", - "protobuf/graph_debug_info.proto", - "protobuf/queue_runner.proto", - "protobuf/rewriter_config.proto", - "protobuf/tensor_bundle.proto", - "protobuf/saver.proto", - "protobuf/verifier_config.proto", + "//tensorflow/core/protobuf:bfc_memory_map.proto", + "//tensorflow/core/protobuf:config.proto", + "//tensorflow/core/protobuf:cluster.proto", + "//tensorflow/core/protobuf:debug.proto", + "//tensorflow/core/protobuf:device_filters.proto", + "//tensorflow/core/protobuf:device_properties.proto", + "//tensorflow/core/protobuf:graph_debug_info.proto", + "//tensorflow/core/protobuf:queue_runner.proto", + "//tensorflow/core/protobuf:rewriter_config.proto", + "//tensorflow/core/protobuf:tensor_bundle.proto", + "//tensorflow/core/protobuf:saver.proto", + "//tensorflow/core/protobuf:verifier_config.proto", ] EXAMPLE_PROTO_SRCS = [ @@ -242,7 +236,7 @@ PROFILER_PROTO_SRCS = [ ] ERROR_CODES_PROTO_SRCS = [ - "protobuf/error_codes.proto", + "//tensorflow/core/protobuf:error_codes.proto", "//tensorflow/core/lib/core:error_codes.proto", ] # LINT.ThenChange(//tensorflow/core/portable_proto_config.asciipb) @@ -255,11 +249,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - ":core_protos", - ":error_codes_proto_impl", "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", "//tensorflow/core/lib/core:error_codes_proto", + "//tensorflow/core/profiler/protobuf:xplane_proto", + "//tensorflow/core/profiler:profiler_options_proto", + "//tensorflow/core/protobuf:error_codes_proto_impl", + "//tensorflow/core/protobuf:for_core_protos", "//tensorflow/core/util:protos_all", "//tensorflow/core/util:test_log_proto_impl", ], @@ -1274,7 +1270,7 @@ filegroup( "//tensorflow/core/platform:mobile_srcs_no_runtime", "//tensorflow/core/public:mobile_srcs_no_runtime", "//tensorflow/core/util:mobile_srcs_no_runtime", - "//tensorflow/core/util/ctc:android_srcs", + 
"//tensorflow/core/util/ctc:mobile_srcs", ] + glob( [ "client/**/*.cc", @@ -1304,12 +1300,12 @@ filegroup( "//tensorflow/core/common_runtime/eager:srcs", "//tensorflow/core/framework:mobile_srcs_only_runtime", "//tensorflow/core/graph:mobile_srcs_only_runtime", - "//tensorflow/core/kernels:android_srcs", + "//tensorflow/core/kernels:mobile_srcs", "//tensorflow/core/lib/io:mobile_srcs_only_runtime", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/public:mobile_srcs_only_runtime", "//tensorflow/core/util/sparse:mobile_srcs_only_runtime", - "//tensorflow/core/util/tensor_bundle:android_srcs", + "//tensorflow/core/util/tensor_bundle:mobile_srcs", "//tensorflow/core/util:mobile_srcs_only_runtime", # Sources for which we already have granular targets. @@ -1382,10 +1378,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":protos_all_cc_impl", "//tensorflow/core/util:stats_calculator_portable", "//tensorflow/core:mobile_additional_lib_deps", - ] + tf_portable_deps_no_runtime(), + ] + tf_portable_proto_lib() + tf_portable_deps_no_runtime(), alwayslink = 1, ) @@ -1417,54 +1412,12 @@ cc_library( ], ) -# Native library support for iOS applications. -# -# bazel build --config=ios_x86_64 \ -# :ios_tensorflow_lib -cc_library( - name = "ios_tensorflow_lib", - srcs = if_ios([ - ":portable_op_registrations_and_gradients", - "//tensorflow/core/kernels:android_core_ops", - "//tensorflow/core/kernels:android_extended_ops", - ]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos() + ["-Os"], - visibility = ["//visibility:public"], - deps = [ - ":portable_tensorflow_lib_lite", - ":protos_all_cc_impl", - "//third_party/eigen3", - "//third_party/fft2d:fft2d_headers", - "@com_google_protobuf//:protobuf", - "@fft2d", - "@gemmlowp", - ], - alwayslink = 1, -) - alias( name = "ios_tensorflow_lib_lite", actual = ":portable_tensorflow_lib_lite", visibility = ["//visibility:public"], ) -cc_library( - name = "ios_tensorflow_test_lib", - testonly = 1, - srcs = if_ios([":android_test_srcs"]), - copts = tf_copts() + ["-Os"], - tags = [ - "manual", - "notap", - ], - visibility = ["//visibility:public"], - deps = [ - ":ios_tensorflow_lib", - "//tensorflow/core/platform/default/build_config:gtest", - "//third_party/eigen3", - ], -) - # Full TensorFlow library with operator support. Use this unless reducing # binary size (by packaging a reduced operator set) is a concern. 
alias( @@ -1473,10 +1426,16 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_lib", + actual = ":portable_tensorflow_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_lib", srcs = if_mobile([":portable_op_registrations_and_gradients"]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos(), + copts = tf_copts() + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile(), tags = [ "manual", @@ -1559,6 +1518,12 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_test_lib", + actual = ":portable_tensorflow_test_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_test_lib", testonly = 1, @@ -1569,7 +1534,7 @@ cc_library( "//tensorflow/core/framework:android_test_hdrs", "//tensorflow/core/util:android_test_hdrs", ], - copts = tf_copts(android_optimization_level_override = None), + copts = tf_copts(android_optimization_level_override = None) + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile() + tf_opts_nortti_if_lite_protos(), tags = [ "manual", @@ -1637,20 +1602,13 @@ alias( [ alias( name = "protobuf_%s_pyclif%s" % (proto_name, target_suffix), - actual = ":protobuf/%s_pyclif%s" % (proto_name, target_suffix), + actual = "//tensorflow/core/protobuf:%s_pyclif%s" % (proto_name, target_suffix), visibility = ["//visibility:public"], ) for target_suffix in [ "", "_pb2", ] - ] + [ - tf_pyclif_proto_library( - name = "protobuf/%s_pyclif" % proto_name, - proto_lib = ":protos_all", - proto_srcfile = "protobuf/%s.proto" % proto_name, - visibility = ["//visibility:public"], - ), ] for proto_name in [ "config", @@ -1664,77 +1622,74 @@ alias( # ----------------------------------------------------------------------------- # Internal targets -tf_proto_library( +alias( name = "autotuning_proto", - srcs = ["protobuf/autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library( +alias( + name = "autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:autotuning_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( name = "conv_autotuning_proto", - srcs = ["protobuf/conv_autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - "//tensorflow/stream_executor:dnn_proto", - ], + actual = "//tensorflow/core/protobuf:conv_autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "worker_proto", - srcs = ["protobuf/worker.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//visibility:public"], -) - -tf_proto_library_cc( - name = "worker_service_proto", - srcs = ["protobuf/worker_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":worker_proto"], +alias( + name = "conv_autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:conv_autotuning_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "master_proto", - srcs = ["protobuf/master.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//tensorflow:internal"], -) - -tf_proto_library_cc( - name = "master_service_proto", - srcs = ["protobuf/master_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":master_proto"], 
+alias( + name = "worker_proto_cc", + actual = "//tensorflow/core/protobuf:worker_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "eager_service_proto", - srcs = ["protobuf/eager_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_grpc_version = 1, - cc_stubby_versions = ["2"], - protodeps = tf_additional_all_protos(), +alias( + name = "worker_service_proto_cc", + actual = "//tensorflow/core/protobuf:worker_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "master_proto_cc", + actual = "//tensorflow/core/protobuf:master_proto_cc", + visibility = [ + "//learning/brain/frameworks/uptc:__subpackages__", + "//tensorflow:internal", + ], +) + +alias( + name = "master_service_proto_cc", + actual = "//tensorflow/core/protobuf:master_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "eager_service_proto_cc", + actual = "//tensorflow/core/protobuf:eager_service_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -2146,49 +2101,14 @@ cc_library( ], ) -tf_proto_library( +alias( name = "error_codes_proto_impl", - srcs = ["protobuf/error_codes.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:error_codes_proto_impl", ) -tf_proto_library( - name = "core_protos", - srcs = COMMON_PROTO_SRCS + [ - # Protos which are not needed on mobile builds, but should be included - # in protos_all. - # - # Note that some protos are in neither core_proto_srcs nor this - # filegroup; e.g. ones with individual proto_library targets. - "protobuf/control_flow.proto", - # TODO(ebrevdo): Re-enable once CriticalSection is in core. - # "protobuf/critical_section.proto", - "protobuf/data/experimental/snapshot.proto", - "protobuf/debug_event.proto", - "protobuf/meta_graph.proto", - "protobuf/named_tensor.proto", - "protobuf/remote_tensor_handle.proto", - "protobuf/saved_model.proto", - "protobuf/saved_object_graph.proto", - "protobuf/struct.proto", - "protobuf/tensorflow_server.proto", - "protobuf/trackable_object_graph.proto", - "protobuf/transport_options.proto", - ], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - ":error_codes_proto_impl", - "//tensorflow/core/example:protos_all", - "//tensorflow/core/framework:protos_all", - "//tensorflow/core/lib/core:error_codes_proto", - "//tensorflow/core/profiler/protobuf:xplane_proto", - "//tensorflow/core/profiler:profiler_options_proto", - "//tensorflow/core/util:protos_all", - "//tensorflow/core/util:test_log_proto_impl", - ], - visibility = ["//visibility:private"], +alias( + name = "error_codes_proto_impl_cc", + actual = "//tensorflow/core/protobuf:error_codes_proto_impl_cc", ) alias( @@ -2334,6 +2254,7 @@ tf_cuda_library( "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/core/util:einsum_op_util", "//tensorflow/core/util:padding", "//tensorflow/core/util:port", @@ -2480,13 +2401,9 @@ alias( visibility = ["//visibility:public"], ) -tf_proto_library_cc( - name = "replay_log_proto", - srcs = ["protobuf/replay_log.proto"], - cc_api_version = 2, - protodeps = [ - ":master_proto", - ] + tf_additional_all_protos(), +alias( + name = "replay_log_proto_cc", + actual = "//tensorflow/core/protobuf:replay_log_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -3117,6 +3034,11 @@ alias( actual = 
"//tensorflow/core/platform:cuda_libdevice_path", ) +# Normalize CORE_PROTO_SRCS to generate valid output file names. +PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ + "//google/protobuf/any.proto.h", +] + transitive_hdrs( name = "headers", visibility = ["//tensorflow:__subpackages__"], @@ -3129,8 +3051,3 @@ transitive_hdrs( "//tensorflow/core/platform:platform_strings", ], ) - -# Normalize CORE_PROTO_SRCS to generate valid output file names. -PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ - "//google/protobuf/any.proto.h", -] diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt index 0f49a18a114..f3379461a5f 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt @@ -65,7 +65,7 @@ END summary: "Update \'*var\' according to the Ftrl-proximal scheme." description: < l1 else 0.0 accum = accum_new diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt index 3218ab7776c..1eb33005e91 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt @@ -65,8 +65,8 @@ END summary: "Update \'*var\' according to the Ftrl-proximal scheme." description: < l1 else 0.0 diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt index 09eff6177b1..ae5942b3617 100644 --- a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt @@ -84,6 +84,13 @@ END name: "Tout" description: <